toml_parser/lexer/
mod.rs

//! Lex TOML tokens
//!
//! To get started, see [`Source::lex`][crate::Source::lex]
4
5#[cfg(test)]
6#[cfg(feature = "std")]
7mod test;
8mod token;
9
10#[cfg(feature = "alloc")]
11use alloc::vec::Vec;
12
13use winnow::stream::AsBStr as _;
14use winnow::stream::ContainsToken as _;
15use winnow::stream::FindSlice as _;
16use winnow::stream::Location;
17use winnow::stream::Stream as _;
18
19use crate::Span;
20
21pub use token::Token;
22pub use token::TokenKind;
23
24/// Lex TOML [tokens][Token]
25///
26/// To get started, see [`Source::lex`][crate::Source::lex]
27pub struct Lexer<'i> {
28    stream: Stream<'i>,
29    eof: bool,
30}
31
32impl<'i> Lexer<'i> {
33    pub(crate) fn new(input: &'i str) -> Self {
34        let mut stream = Stream::new(input);
35        if input.as_bytes().starts_with(BOM) {
36            let offset = BOM.len();
37            #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
38            unsafe {
39                stream.next_slice_unchecked(offset)
40            };
41            #[cfg(not(feature = "unsafe"))]
42            stream.next_slice(offset);
43        }
44        Lexer { stream, eof: false }
45    }
46
47    #[cfg(feature = "alloc")]
48    pub fn into_vec(self) -> Vec<Token> {
49        #![allow(unused_qualifications)] // due to MSRV of 1.66
50        let capacity = core::cmp::min(
51            self.stream.len(),
52            usize::MAX / core::mem::size_of::<Token>(),
53        );
54        let mut vec = Vec::with_capacity(capacity);
55        vec.extend(self);
56        vec
57    }
58}
59
60impl Iterator for Lexer<'_> {
61    type Item = Token;
62
63    fn next(&mut self) -> Option<Self::Item> {
64        let Some(peek_byte) = self.stream.as_bstr().first() else {
65            if self.eof {
66                return None;
67            } else {
68                self.eof = true;
69                let start = self.stream.current_token_start();
70                let span = Span::new_unchecked(start, start);
71                return Some(Token::new(TokenKind::Eof, span));
72            }
73        };
74        Some(process_token(*peek_byte, &mut self.stream))
75    }
76}
77
/// UTF-8 encoding of the byte order mark (U+FEFF)
const BOM: &[u8] = "\u{FEFF}".as_bytes();
79
80pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;
81
82fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
83    let token = match peek_byte {
84        b'.' => lex_ascii_char(stream, TokenKind::Dot),
85        b'=' => lex_ascii_char(stream, TokenKind::Equals),
86        b',' => lex_ascii_char(stream, TokenKind::Comma),
87        b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
88        b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
89        b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
90        b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
91        b' ' => lex_whitespace(stream),
92        b'\t' => lex_whitespace(stream),
93        b'#' => lex_comment(stream),
94        b'\r' => lex_crlf(stream),
95        b'\n' => lex_ascii_char(stream, TokenKind::Newline),
96        b'\'' => {
97            if stream.starts_with(ML_LITERAL_STRING_DELIM) {
98                lex_ml_literal_string(stream)
99            } else {
100                lex_literal_string(stream)
101            }
102        }
103        b'"' => {
104            if stream.starts_with(ML_BASIC_STRING_DELIM) {
105                lex_ml_basic_string(stream)
106            } else {
107                lex_basic_string(stream)
108            }
109        }
110        _ => lex_atom(stream),
111    };
112    token
113}
114
115/// Process an ASCII character token
116///
117/// # Safety
118///
119/// - `stream` must be UTF-8
120/// - `stream` must be non-empty
121/// - `stream[0]` must be ASCII
122fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
123    debug_assert!(!stream.is_empty());
124    let start = stream.current_token_start();
125
126    let offset = 1; // an ascii character
127    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
128    unsafe {
129        stream.next_slice_unchecked(offset)
130    };
131    #[cfg(not(feature = "unsafe"))]
132    stream.next_slice(offset);
133
134    let end = stream.previous_token_end();
135    let span = Span::new_unchecked(start, end);
136    Token::new(kind, span)
137}
138
139/// Process Whitespace
140///
141/// ```abnf
142/// ;; Whitespace
143///
144/// ws = *wschar
145/// wschar =  %x20  ; Space
146/// wschar =/ %x09  ; Horizontal tab
147/// ```
148///
149/// # Safety
150///
151/// - `stream` must be UTF-8
152/// - `stream` must be non-empty
153fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
154    debug_assert!(!stream.is_empty());
155    let start = stream.current_token_start();
156
157    let offset = stream
158        .as_bstr()
159        .offset_for(|b| !WSCHAR.contains_token(b))
160        .unwrap_or(stream.eof_offset());
161    #[cfg(feature = "unsafe")] // SAFETY: WSCHAR ensures `offset` will be at UTF-8 boundary
162    unsafe {
163        stream.next_slice_unchecked(offset)
164    };
165    #[cfg(not(feature = "unsafe"))]
166    stream.next_slice(offset);
167
168    let end = stream.previous_token_end();
169    let span = Span::new_unchecked(start, end);
170    Token::new(TokenKind::Whitespace, span)
171}
172
/// The two bytes TOML treats as whitespace within a line
///
/// ```abnf
/// wschar =  %x20  ; Space
/// wschar =/ %x09  ; Horizontal tab
/// ```
pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
178
179/// Process Comment
180///
181/// ```abnf
182/// ;; Comment
183///
184/// comment-start-symbol = %x23 ; #
185/// non-ascii = %x80-D7FF / %xE000-10FFFF
186/// non-eol = %x09 / %x20-7E / non-ascii
187///
188/// comment = comment-start-symbol *non-eol
189/// ```
190///
191/// # Safety
192///
193/// - `stream` must be UTF-8
194/// - `stream[0] == b'#'`
195fn lex_comment(stream: &mut Stream<'_>) -> Token {
196    let start = stream.current_token_start();
197
198    let offset = stream
199        .as_bytes()
200        .find_slice((b'\r', b'\n'))
201        .map(|s| s.start)
202        .unwrap_or_else(|| stream.eof_offset());
203    #[cfg(feature = "unsafe")] // SAFETY: newlines ensure `offset` is along UTF-8 boundary
204    unsafe {
205        stream.next_slice_unchecked(offset)
206    };
207    #[cfg(not(feature = "unsafe"))]
208    stream.next_slice(offset);
209
210    let end = stream.previous_token_end();
211    let span = Span::new_unchecked(start, end);
212    Token::new(TokenKind::Comment, span)
213}
214
/// Byte that opens a comment
///
/// ```abnf
/// comment-start-symbol = %x23 ; #
/// ```
pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
219
220/// Process Newline
221///
222/// ```abnf
223/// ;; Newline
224///
225/// newline =  %x0A     ; LF
226/// newline =/ %x0D.0A  ; CRLF
227/// ```
228///
229/// # Safety
230///
231/// - `stream` must be UTF-8
232/// - `stream[0] == b'\r'`
233fn lex_crlf(stream: &mut Stream<'_>) -> Token {
234    let start = stream.current_token_start();
235
236    let mut offset = '\r'.len_utf8();
237    let has_lf = stream.as_bstr().get(1) == Some(&b'\n');
238    if has_lf {
239        offset += '\n'.len_utf8();
240    }
241
242    #[cfg(feature = "unsafe")] // SAFETY: newlines ensure `offset` is along UTF-8 boundary
243    unsafe {
244        stream.next_slice_unchecked(offset)
245    };
246    #[cfg(not(feature = "unsafe"))]
247    stream.next_slice(offset);
248    let end = stream.previous_token_end();
249    let span = Span::new_unchecked(start, end);
250
251    Token::new(TokenKind::Newline, span)
252}
253
254/// Process literal string
255///
256/// ```abnf
257/// ;; Literal String
258///
259/// literal-string = apostrophe *literal-char apostrophe
260///
261/// apostrophe = %x27 ; ' apostrophe
262///
263/// literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
264/// ```
265///
266/// # Safety
267///
268/// - `stream` must be UTF-8
269/// - `stream[0] == b'\''`
270fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
271    let start = stream.current_token_start();
272
273    let offset = 1; // APOSTROPHE
274    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
275    unsafe {
276        stream.next_slice_unchecked(offset)
277    };
278    #[cfg(not(feature = "unsafe"))]
279    stream.next_slice(offset);
280
281    let offset = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
282        Some(span) => {
283            if stream.as_bstr()[span.start] == APOSTROPHE {
284                span.end
285            } else {
286                span.start
287            }
288        }
289        None => stream.eof_offset(),
290    };
291    #[cfg(feature = "unsafe")]
292    // SAFETY: `APOSTROPHE`/newline ensure `offset` is along UTF-8 boundary
293    unsafe {
294        stream.next_slice_unchecked(offset)
295    };
296    #[cfg(not(feature = "unsafe"))]
297    stream.next_slice(offset);
298
299    let end = stream.previous_token_end();
300    let span = Span::new_unchecked(start, end);
301    Token::new(TokenKind::LiteralString, span)
302}
303
/// Delimiter byte of literal strings
///
/// ```abnf
/// apostrophe = %x27 ; ' apostrophe
/// ```
pub(crate) const APOSTROPHE: u8 = b'\'';
308
309/// Process multi-line literal string
310///
311/// ```abnf
312/// ;; Multiline Literal String
313///
314/// ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
315///                     ml-literal-string-delim
316/// ml-literal-string-delim = 3apostrophe
317/// ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
318///
319/// mll-content = literal-char / newline
320/// mll-quotes = 1*2apostrophe
321/// ```
322///
323/// # Safety
324///
325/// - `stream` must be UTF-8
326/// - `stream.starts_with(ML_LITERAL_STRING_DELIM)`
327fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
328    let start = stream.current_token_start();
329
330    let offset = ML_LITERAL_STRING_DELIM.len();
331    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
332    unsafe {
333        stream.next_slice_unchecked(offset)
334    };
335    #[cfg(not(feature = "unsafe"))]
336    stream.next_slice(offset);
337
338    let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
339        Some(span) => span.end,
340        None => stream.eof_offset(),
341    };
342    #[cfg(feature = "unsafe")]
343    // SAFETY: `ML_LITERAL_STRING_DELIM` ensure `offset` is along UTF-8 boundary
344    unsafe {
345        stream.next_slice_unchecked(offset)
346    };
347    #[cfg(not(feature = "unsafe"))]
348    stream.next_slice(offset);
349
350    if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
351        let offset = 1;
352        #[cfg(feature = "unsafe")] // SAFETY: `APOSTROPHE` ensure `offset` is along UTF-8 boundary
353        unsafe {
354            stream.next_slice_unchecked(offset)
355        };
356        #[cfg(not(feature = "unsafe"))]
357        stream.next_slice(offset);
358
359        if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
360            let offset = 1;
361            #[cfg(feature = "unsafe")]
362            // SAFETY: `APOSTROPHE` ensure `offset` is along UTF-8 boundary
363            unsafe {
364                stream.next_slice_unchecked(offset)
365            };
366            #[cfg(not(feature = "unsafe"))]
367            stream.next_slice(offset);
368        }
369    }
370
371    let end = stream.previous_token_end();
372    let span = Span::new_unchecked(start, end);
373    Token::new(TokenKind::MlLiteralString, span)
374}
375
/// Opening and closing delimiter of multi-line literal strings
///
/// ```abnf
/// ml-literal-string-delim = 3apostrophe
/// ```
pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";
380
381/// Process basic string
382///
383/// ```abnf
384/// ;; Basic String
385///
386/// basic-string = quotation-mark *basic-char quotation-mark
387///
388/// quotation-mark = %x22            ; "
389///
390/// basic-char = basic-unescaped / escaped
391/// basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
392/// escaped = escape escape-seq-char
393///
394/// escape = %x5C                   ; \
395/// escape-seq-char =  %x22         ; "    quotation mark  U+0022
396/// escape-seq-char =/ %x5C         ; \    reverse solidus U+005C
397/// escape-seq-char =/ %x62         ; b    backspace       U+0008
398/// escape-seq-char =/ %x65         ; e    escape          U+001B
399/// escape-seq-char =/ %x66         ; f    form feed       U+000C
400/// escape-seq-char =/ %x6E         ; n    line feed       U+000A
401/// escape-seq-char =/ %x72         ; r    carriage return U+000D
402/// escape-seq-char =/ %x74         ; t    tab             U+0009
403/// escape-seq-char =/ %x78 2HEXDIG ; xHH                  U+00HH
404/// escape-seq-char =/ %x75 4HEXDIG ; uHHHH                U+HHHH
405/// escape-seq-char =/ %x55 8HEXDIG ; UHHHHHHHH            U+HHHHHHHH
406/// ```
407///
408/// # Safety
409///
410/// - `stream` must be UTF-8
411/// - `stream[0] == b'"'`
412fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
413    let start = stream.current_token_start();
414
415    let offset = 1; // QUOTATION_MARK
416    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
417    unsafe {
418        stream.next_slice_unchecked(offset)
419    };
420    #[cfg(not(feature = "unsafe"))]
421    stream.next_slice(offset);
422
423    loop {
424        // newline is present for error recovery
425        match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
426            Some(span) => {
427                let found = stream.as_bstr()[span.start];
428                if found == QUOTATION_MARK {
429                    let offset = span.end;
430                    #[cfg(feature = "unsafe")]
431                    // SAFETY: `QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
432                    unsafe {
433                        stream.next_slice_unchecked(offset)
434                    };
435                    #[cfg(not(feature = "unsafe"))]
436                    stream.next_slice(offset);
437                    break;
438                } else if found == ESCAPE {
439                    let offset = span.end;
440                    #[cfg(feature = "unsafe")]
441                    // SAFETY: `ESCAPE` / newline ensure `offset` is along UTF-8 boundary
442                    unsafe {
443                        stream.next_slice_unchecked(offset)
444                    };
445                    #[cfg(not(feature = "unsafe"))]
446                    stream.next_slice(offset);
447
448                    let peek = stream.as_bstr().peek_token();
449                    match peek {
450                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
451                            let offset = 1; // ESCAPE / QUOTATION_MARK
452                            #[cfg(feature = "unsafe")]
453                            #[cfg(feature = "unsafe")]
454                            // SAFETY: `ESCAPE` / newline ensure `offset` is along UTF-8 boundary
455                            unsafe {
456                                stream.next_slice_unchecked(offset)
457                            };
458                            #[cfg(not(feature = "unsafe"))]
459                            stream.next_slice(offset);
460                        }
461                        _ => {}
462                    }
463                    continue;
464                } else if found == b'\n' {
465                    let offset = span.start;
466                    #[cfg(feature = "unsafe")]
467                    // SAFETY: newline ensure `offset` is along UTF-8 boundary
468                    unsafe {
469                        stream.next_slice_unchecked(offset)
470                    };
471                    #[cfg(not(feature = "unsafe"))]
472                    stream.next_slice(offset);
473                    break;
474                } else {
475                    unreachable!("found `{found}`");
476                }
477            }
478            None => {
479                stream.finish();
480                break;
481            }
482        }
483    }
484
485    let end = stream.previous_token_end();
486    let span = Span::new_unchecked(start, end);
487    Token::new(TokenKind::BasicString, span)
488}
489
/// Delimiter byte of basic strings
///
/// ```abnf
/// quotation-mark = %x22            ; "
/// ```
pub(crate) const QUOTATION_MARK: u8 = b'"';
494
/// Byte that introduces an escape sequence inside basic strings
///
/// ```abnf
/// escape = %x5C                   ; \
/// ```
pub(crate) const ESCAPE: u8 = b'\\';
499
500/// Process multi-line basic string
501///
502/// ```abnf
503/// ;; Multiline Basic String
504///
505/// ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
506///                   ml-basic-string-delim
507/// ml-basic-string-delim = 3quotation-mark
508/// ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
509///
510/// mlb-content = basic-char / newline / mlb-escaped-nl
511/// mlb-quotes = 1*2quotation-mark
512/// mlb-escaped-nl = escape ws newline *( wschar / newline )
513/// ```
514///
515/// # Safety
516///
517/// - `stream` must be UTF-8
518/// - `stream.starts_with(ML_BASIC_STRING_DELIM)`
519fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
520    let start = stream.current_token_start();
521
522    let offset = ML_BASIC_STRING_DELIM.len();
523    #[cfg(feature = "unsafe")] // SAFETY: only called when next character is ASCII
524    unsafe {
525        stream.next_slice_unchecked(offset)
526    };
527    #[cfg(not(feature = "unsafe"))]
528    stream.next_slice(offset);
529
530    loop {
531        // newline is present for error recovery
532        match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
533            Some(span) => {
534                let found = stream.as_bstr()[span.start];
535                if found == QUOTATION_MARK {
536                    let offset = span.end;
537                    #[cfg(feature = "unsafe")]
538                    // SAFETY: `QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
539                    unsafe {
540                        stream.next_slice_unchecked(offset)
541                    };
542                    #[cfg(not(feature = "unsafe"))]
543                    stream.next_slice(offset);
544                    break;
545                } else if found == ESCAPE {
546                    let offset = span.end;
547                    #[cfg(feature = "unsafe")]
548                    // SAFETY: `ESCAPE` ensure `offset` is along UTF-8 boundary
549                    unsafe {
550                        stream.next_slice_unchecked(offset)
551                    };
552                    #[cfg(not(feature = "unsafe"))]
553                    stream.next_slice(offset);
554
555                    let peek = stream.as_bstr().peek_token();
556                    match peek {
557                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
558                            let offset = 1; // ESCAPE / QUOTATION_MARK
559                            #[cfg(feature = "unsafe")]
560                            // SAFETY: `QUOTATION_MARK`/`ESCAPE` ensure `offset` is along UTF-8 boundary
561                            unsafe {
562                                stream.next_slice_unchecked(offset)
563                            };
564                            #[cfg(not(feature = "unsafe"))]
565                            stream.next_slice(offset);
566                        }
567                        _ => {}
568                    }
569                    continue;
570                } else {
571                    unreachable!("found `{found}`");
572                }
573            }
574            None => {
575                stream.finish();
576                break;
577            }
578        }
579    }
580    if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
581        let offset = 1;
582        #[cfg(feature = "unsafe")]
583        // SAFETY: `QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
584        unsafe {
585            stream.next_slice_unchecked(offset)
586        };
587        #[cfg(not(feature = "unsafe"))]
588        stream.next_slice(offset);
589        if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
590            let offset = 1;
591            #[cfg(feature = "unsafe")]
592            // SAFETY: `QUOTATION_MARK` ensure `offset` is along UTF-8 boundary
593            unsafe {
594                stream.next_slice_unchecked(offset)
595            };
596            #[cfg(not(feature = "unsafe"))]
597            stream.next_slice(offset);
598        }
599    }
600
601    let end = stream.previous_token_end();
602    let span = Span::new_unchecked(start, end);
603    Token::new(TokenKind::MlBasicString, span)
604}
605
/// Opening and closing delimiter of multi-line basic strings
///
/// ```abnf
/// ml-basic-string-delim = 3quotation-mark
/// ```
pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";
610
611/// Process Atom
612///
613/// This is everything else
614///
615/// # Safety
616///
617/// - `stream` must be UTF-8
618/// - `stream` must be non-empty
619fn lex_atom(stream: &mut Stream<'_>) -> Token {
620    let start = stream.current_token_start();
621
622    // Intentionally leaves off quotes in case the opening quote was missing
623    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n";
624    let offset = stream
625        .as_bstr()
626        .offset_for(|b| TOKEN_START.contains_token(b))
627        .unwrap_or_else(|| stream.eof_offset());
628    #[cfg(feature = "unsafe")] // SAFETY: `TOKEN_START` ensure `offset` is along UTF-8 boundary
629    unsafe {
630        stream.next_slice_unchecked(offset)
631    };
632    #[cfg(not(feature = "unsafe"))]
633    stream.next_slice(offset);
634
635    let end = stream.previous_token_end();
636    let span = Span::new_unchecked(start, end);
637    Token::new(TokenKind::Atom, span)
638}