1#[cfg(test)]
6#[cfg(feature = "std")]
7mod test;
8mod token;
9
10#[cfg(feature = "alloc")]
11use alloc::vec::Vec;
12
13use winnow::stream::AsBStr as _;
14use winnow::stream::ContainsToken as _;
15use winnow::stream::FindSlice as _;
16use winnow::stream::Location;
17use winnow::stream::Stream as _;
18
19use crate::Span;
20
21pub use token::Token;
22pub use token::TokenKind;
23
/// A lazy tokenizer over the input string.
///
/// Tokens are produced on demand through the [`Iterator`] impl; after the
/// input is exhausted a single [`TokenKind::Eof`] token is yielded, then
/// `None`.
pub struct Lexer<'i> {
    // Remaining input, with byte-offset tracking for building `Span`s.
    stream: Stream<'i>,
    // Set once the trailing `Eof` token has been emitted.
    eof: bool,
}
31
impl<'i> Lexer<'i> {
    /// Creates a lexer over `input`, skipping a leading UTF-8 BOM if present.
    pub(crate) fn new(input: &'i str) -> Self {
        let mut stream = Stream::new(input);
        if input.as_bytes().starts_with(BOM) {
            let offset = BOM.len();
            // SAFETY: the stream starts with the 3-byte BOM sequence, so
            // advancing by `BOM.len()` stays in bounds and lands on a UTF-8
            // character boundary.
            #[cfg(feature = "unsafe")]
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
        Lexer { stream, eof: false }
    }

    /// Drains the lexer into a `Vec`, pre-sizing it from the remaining input.
    #[cfg(feature = "alloc")]
    pub fn into_vec(self) -> Vec<Token> {
        #![allow(unused_qualifications)]
        // Clamp the pre-allocation so `with_capacity` cannot request an
        // allocation larger than `usize::MAX` bytes; there can be at most
        // one token per remaining input byte.
        let capacity = core::cmp::min(
            self.stream.len(),
            usize::MAX / core::mem::size_of::<Token>(),
        );
        let mut vec = Vec::with_capacity(capacity);
        vec.extend(self);
        vec
    }
}
59
60impl Iterator for Lexer<'_> {
61 type Item = Token;
62
63 fn next(&mut self) -> Option<Self::Item> {
64 let Some(peek_byte) = self.stream.as_bstr().first() else {
65 if self.eof {
66 return None;
67 } else {
68 self.eof = true;
69 let start = self.stream.current_token_start();
70 let span = Span::new_unchecked(start, start);
71 return Some(Token::new(TokenKind::Eof, span));
72 }
73 };
74 Some(process_token(*peek_byte, &mut self.stream))
75 }
76}
77
78const BOM: &[u8] = b"\xEF\xBB\xBF";
79
80pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;
81
82fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
83 let token = match peek_byte {
84 b'.' => lex_ascii_char(stream, TokenKind::Dot),
85 b'=' => lex_ascii_char(stream, TokenKind::Equals),
86 b',' => lex_ascii_char(stream, TokenKind::Comma),
87 b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
88 b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
89 b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
90 b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
91 b' ' => lex_whitespace(stream),
92 b'\t' => lex_whitespace(stream),
93 b'#' => lex_comment(stream),
94 b'\r' => lex_crlf(stream),
95 b'\n' => lex_ascii_char(stream, TokenKind::Newline),
96 b'\'' => {
97 if stream.starts_with(ML_LITERAL_STRING_DELIM) {
98 lex_ml_literal_string(stream)
99 } else {
100 lex_literal_string(stream)
101 }
102 }
103 b'"' => {
104 if stream.starts_with(ML_BASIC_STRING_DELIM) {
105 lex_ml_basic_string(stream)
106 } else {
107 lex_basic_string(stream)
108 }
109 }
110 _ => lex_atom(stream),
111 };
112 token
113}
114
/// Lexes a single-byte token of the given `kind`.
///
/// The caller has already established that the next byte is the ASCII byte
/// corresponding to `kind`.
fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    let offset = 1;
    // SAFETY: the stream is non-empty and its first byte is ASCII (the
    // caller dispatched on it), so an offset of 1 is in bounds and on a
    // UTF-8 character boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(kind, span)
}
138
/// Lexes a maximal run of spaces and tabs into one `Whitespace` token.
fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
    debug_assert!(!stream.is_empty());
    let start = stream.current_token_start();

    // Advance to the first non-whitespace byte, or to end of input.
    let offset = stream
        .as_bstr()
        .offset_for(|b| !WSCHAR.contains_token(b))
        .unwrap_or(stream.eof_offset());
    // SAFETY: `offset` comes from `offset_for`/`eof_offset` on this stream,
    // so it is in bounds; space and tab are ASCII, so it lies on a UTF-8
    // character boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Whitespace, span)
}
172
173pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
178
/// Lexes a `#` comment running to (but not including) the next CR or LF,
/// or to end of input.
fn lex_comment(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Everything up to the next line-break byte belongs to the comment.
    let offset = stream
        .as_bytes()
        .find_slice((b'\r', b'\n'))
        .map(|s| s.start)
        .unwrap_or_else(|| stream.eof_offset());
    // SAFETY: `offset` is either the index of an ASCII line-break byte found
    // in the stream or the end of input, so it is in bounds and on a UTF-8
    // character boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Comment, span)
}
214
215pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
219
220fn lex_crlf(stream: &mut Stream<'_>) -> Token {
234 let start = stream.current_token_start();
235
236 let mut offset = '\r'.len_utf8();
237 let has_lf = stream.as_bstr().get(1) == Some(&b'\n');
238 if has_lf {
239 offset += '\n'.len_utf8();
240 }
241
242 #[cfg(feature = "unsafe")] unsafe {
244 stream.next_slice_unchecked(offset)
245 };
246 #[cfg(not(feature = "unsafe"))]
247 stream.next_slice(offset);
248 let end = stream.previous_token_end();
249 let span = Span::new_unchecked(start, end);
250
251 Token::new(TokenKind::Newline, span)
252}
253
/// Lexes a single-quoted literal string.
///
/// The token includes the closing apostrophe. For an unterminated string it
/// ends just before the next `\n`, or at end of input.
fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Consume the opening apostrophe.
    let offset = 1;
    // SAFETY: the dispatching byte is `'` (ASCII), so an offset of 1 is in
    // bounds and on a UTF-8 character boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    // Include a closing apostrophe; stop *before* a newline (unterminated
    // string), or run to end of input if neither appears.
    let offset = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
        Some(span) => {
            if stream.as_bstr()[span.start] == APOSTROPHE {
                span.end
            } else {
                span.start
            }
        }
        None => stream.eof_offset(),
    };
    // SAFETY: `offset` borders an ASCII byte found in the stream, or is the
    // end of input, so it is in bounds and on a UTF-8 character boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::LiteralString, span)
}
303
304pub(crate) const APOSTROPHE: u8 = b'\'';
308
/// Lexes a `'''`-delimited multi-line literal string.
///
/// After the closing `'''`, up to two further apostrophes are folded into
/// the token, since the string body itself may end in one or two quotes.
/// An unterminated string runs to end of input.
fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Consume the opening `'''`.
    let offset = ML_LITERAL_STRING_DELIM.len();
    // SAFETY: the caller verified the stream starts with this 3-byte ASCII
    // delimiter, so the offset is in bounds and on a UTF-8 char boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    // Consume through the closing `'''`, or everything if unterminated.
    let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
        Some(span) => span.end,
        None => stream.eof_offset(),
    };
    // SAFETY: `offset` is the end of an ASCII delimiter match found in the
    // stream, or the end of input, so it is in bounds and on a char boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    // The body may end in quote characters (e.g. `'''a''''`), in which case
    // the *last* `'''` is the real delimiter: absorb up to two extra quotes.
    if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
        let offset = 1;
        // SAFETY: the peeked byte is an ASCII apostrophe, so an offset of 1
        // is in bounds and on a UTF-8 character boundary.
        #[cfg(feature = "unsafe")]
        unsafe {
            stream.next_slice_unchecked(offset)
        };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);

        if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
            let offset = 1;
            // SAFETY: same as above — peeked ASCII apostrophe.
            #[cfg(feature = "unsafe")]
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlLiteralString, span)
}
375
376pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";
380
381fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
413 let start = stream.current_token_start();
414
415 let offset = 1; #[cfg(feature = "unsafe")] unsafe {
418 stream.next_slice_unchecked(offset)
419 };
420 #[cfg(not(feature = "unsafe"))]
421 stream.next_slice(offset);
422
423 loop {
424 match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
426 Some(span) => {
427 let found = stream.as_bstr()[span.start];
428 if found == QUOTATION_MARK {
429 let offset = span.end;
430 #[cfg(feature = "unsafe")]
431 unsafe {
433 stream.next_slice_unchecked(offset)
434 };
435 #[cfg(not(feature = "unsafe"))]
436 stream.next_slice(offset);
437 break;
438 } else if found == ESCAPE {
439 let offset = span.end;
440 #[cfg(feature = "unsafe")]
441 unsafe {
443 stream.next_slice_unchecked(offset)
444 };
445 #[cfg(not(feature = "unsafe"))]
446 stream.next_slice(offset);
447
448 let peek = stream.as_bstr().peek_token();
449 match peek {
450 Some(ESCAPE) | Some(QUOTATION_MARK) => {
451 let offset = 1; #[cfg(feature = "unsafe")]
453 #[cfg(feature = "unsafe")]
454 unsafe {
456 stream.next_slice_unchecked(offset)
457 };
458 #[cfg(not(feature = "unsafe"))]
459 stream.next_slice(offset);
460 }
461 _ => {}
462 }
463 continue;
464 } else if found == b'\n' {
465 let offset = span.start;
466 #[cfg(feature = "unsafe")]
467 unsafe {
469 stream.next_slice_unchecked(offset)
470 };
471 #[cfg(not(feature = "unsafe"))]
472 stream.next_slice(offset);
473 break;
474 } else {
475 unreachable!("found `{found}`");
476 }
477 }
478 None => {
479 stream.finish();
480 break;
481 }
482 }
483 }
484
485 let end = stream.previous_token_end();
486 let span = Span::new_unchecked(start, end);
487 Token::new(TokenKind::BasicString, span)
488}
489
/// Delimiter byte for basic strings.
pub(crate) const QUOTATION_MARK: u8 = b'"';

/// Escape byte inside basic strings.
pub(crate) const ESCAPE: u8 = b'\\';
499
/// Lexes a `"""`-delimited multi-line basic string, skipping over `\\` and
/// `\"` escapes.
///
/// After the closing `"""`, up to two further quotes are folded into the
/// token, since the string body itself may end in one or two quotes. An
/// unterminated string runs to end of input.
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Consume the opening `"""`.
    let offset = ML_BASIC_STRING_DELIM.len();
    // SAFETY: the caller verified the stream starts with this 3-byte ASCII
    // delimiter, so the offset is in bounds and on a UTF-8 char boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    loop {
        // Search for either the closing delimiter or an escape; a match on
        // the delimiter starts with `"`, which is how it is told apart below.
        match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    // Closing `"""`: include it and stop.
                    let offset = span.end;
                    // SAFETY: `offset` is the end of an ASCII delimiter
                    // match found in the stream, so it is in bounds and on a
                    // char boundary.
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    // Skip past the escape byte…
                    let offset = span.end;
                    // SAFETY: `offset` borders an ASCII byte found in the
                    // stream, so it is in bounds and on a char boundary.
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);

                    // …and, for `\\` or `\"`, past the escaped byte too so a
                    // quote cannot be misread as part of the terminator.
                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1;
                            // SAFETY: the peeked byte is ASCII, so an offset
                            // of 1 is in bounds and on a char boundary.
                            #[cfg(feature = "unsafe")]
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else {
                    unreachable!("found `{found}`");
                }
            }
            None => {
                // Unterminated: take the rest of the input.
                stream.finish();
                break;
            }
        }
    }
    // The body may end in quote characters (e.g. `"""a""""`), in which case
    // the *last* `"""` is the real delimiter: absorb up to two extra quotes.
    if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
        let offset = 1;
        // SAFETY: the peeked byte is an ASCII quote, so an offset of 1 is in
        // bounds and on a UTF-8 character boundary.
        #[cfg(feature = "unsafe")]
        unsafe {
            stream.next_slice_unchecked(offset)
        };
        #[cfg(not(feature = "unsafe"))]
        stream.next_slice(offset);
        if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
            let offset = 1;
            // SAFETY: same as above — peeked ASCII quote.
            #[cfg(feature = "unsafe")]
            unsafe {
                stream.next_slice_unchecked(offset)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(offset);
        }
    }

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::MlBasicString, span)
}
605
606pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";
610
/// Lexes a maximal run of bytes that cannot start any other token into one
/// `Atom` token (classification is left to later stages).
fn lex_atom(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();

    // Bytes that end an atom: exactly the bytes `process_token` dispatches
    // on (punctuation, whitespace, comment start, line breaks).
    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n";
    let offset = stream
        .as_bstr()
        .offset_for(|b| TOKEN_START.contains_token(b))
        .unwrap_or_else(|| stream.eof_offset());
    // SAFETY: `offset` is the index of an ASCII byte found in the stream or
    // the end of input, so it is in bounds and on a UTF-8 char boundary.
    #[cfg(feature = "unsafe")]
    unsafe {
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);

    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::Atom, span)
}