Skip to main content

gruel_lexer/
logos_lexer.rs

//! Logos-based lexer for the Gruel programming language.
//!
//! This module provides a lexer implementation using the logos derive macro
//! for efficient tokenization.
5
6use gruel_util::{CompileError, CompileResult, ErrorKind};
7use gruel_util::{FileId, Span};
8use lasso::{Spur, ThreadedRodeo};
9use logos::Logos;
10
/// Error type for lexing failures.
///
/// Returned by the token callbacks in this module and registered as the
/// logos error type via `#[logos(error = LexError)]`. `tokenize` translates
/// each variant into the corresponding `ErrorKind`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum LexError {
    /// Input that matches no token pattern. This is the `Default` value.
    #[default]
    UnexpectedCharacter,
    /// A run of digits that does not parse as a `u64`.
    InvalidInteger,
    /// A float-shaped literal that does not parse as an `f64`.
    InvalidFloat,
    /// A backslash inside a string literal followed by an unsupported character.
    InvalidStringEscape,
    /// A string literal that hits a newline or end of input before its closing `"`.
    UnterminatedString,
    /// ADR-0071: char literal violations.
    ///
    /// A char literal with nothing between the quotes (`''`).
    EmptyCharLit,
    /// A char literal that hits a line break or end of input before its closing `'`.
    UnterminatedCharLit,
    /// A char literal with more than one character between the quotes.
    MultiCharLit,
    /// A backslash inside a char literal followed by an unsupported character.
    InvalidCharEscape,
    /// A malformed `\u{…}` escape, or one that does not name a Unicode scalar value.
    InvalidUnicodeEscape,
}
27
28/// Process a string literal starting from an opening quote.
29/// This manually scans for the string content and closing quote,
30/// enabling detection of unterminated strings.
31fn process_string_from_quote(lex: &mut logos::Lexer<'_, LogosTokenKind>) -> Result<Spur, LexError> {
32    // At this point we've matched just the opening quote "
33    // We need to scan remainder for string content and closing quote
34    let remainder = lex.remainder();
35    let mut chars = remainder.chars();
36    let mut consumed = 0;
37    let mut result = String::new();
38    let mut found_close = false;
39
40    while let Some(c) = chars.next() {
41        if c == '"' {
42            // Found closing quote
43            consumed += 1;
44            found_close = true;
45            break;
46        } else if c == '\\' {
47            // Escape sequence
48            consumed += c.len_utf8();
49            match chars.next() {
50                Some('\\') => {
51                    consumed += 1;
52                    result.push('\\');
53                }
54                Some('"') => {
55                    consumed += 1;
56                    result.push('"');
57                }
58                Some('n') => {
59                    consumed += 1;
60                    result.push('\n');
61                }
62                Some('t') => {
63                    consumed += 1;
64                    result.push('\t');
65                }
66                Some('r') => {
67                    consumed += 1;
68                    result.push('\r');
69                }
70                Some('0') => {
71                    consumed += 1;
72                    result.push('\0');
73                }
74                Some(other) => {
75                    // Invalid escape - consume the char to get better error position
76                    consumed += other.len_utf8();
77                    lex.bump(consumed);
78                    return Err(LexError::InvalidStringEscape);
79                }
80                None => {
81                    // Backslash at end of input
82                    lex.bump(consumed);
83                    return Err(LexError::UnterminatedString);
84                }
85            }
86        } else if c == '\n' {
87            // Newline in string - string is unterminated at this line
88            // Don't consume the newline so error span points to string start
89            lex.bump(consumed);
90            return Err(LexError::UnterminatedString);
91        } else {
92            consumed += c.len_utf8();
93            result.push(c);
94        }
95    }
96
97    if !found_close {
98        // Reached end of input without closing quote
99        lex.bump(consumed);
100        return Err(LexError::UnterminatedString);
101    }
102
103    // Advance past the string content and closing quote
104    lex.bump(consumed);
105
106    // Intern the string
107    let spur = lex.extras.get_or_intern(&result);
108    Ok(spur)
109}
110
111/// Process a char literal starting from an opening single-quote.
112/// ADR-0071: emits a u32 Unicode scalar value, with thorough error
113/// reporting for empty/multi-char/unterminated/invalid escapes.
114fn process_char_from_quote(lex: &mut logos::Lexer<'_, LogosTokenKind>) -> Result<u32, LexError> {
115    // At entry we've matched just the opening '
116    let remainder = lex.remainder();
117    let mut chars = remainder.chars();
118    let mut consumed: usize = 0;
119
120    // Read the body (one Unicode scalar OR an escape sequence).
121    let scalar: u32 = match chars.next() {
122        None => {
123            return Err(LexError::UnterminatedCharLit);
124        }
125        Some('\'') => {
126            // Empty literal ''
127            lex.bump(1);
128            return Err(LexError::EmptyCharLit);
129        }
130        Some('\n') | Some('\r') => {
131            // Newline/CR before close — unterminated. Don't consume
132            // so the error span is meaningful.
133            return Err(LexError::UnterminatedCharLit);
134        }
135        Some('\\') => {
136            consumed += 1;
137            match chars.next() {
138                None => {
139                    lex.bump(consumed);
140                    return Err(LexError::UnterminatedCharLit);
141                }
142                Some('n') => {
143                    consumed += 1;
144                    '\n' as u32
145                }
146                Some('r') => {
147                    consumed += 1;
148                    '\r' as u32
149                }
150                Some('t') => {
151                    consumed += 1;
152                    '\t' as u32
153                }
154                Some('\\') => {
155                    consumed += 1;
156                    '\\' as u32
157                }
158                Some('\'') => {
159                    consumed += 1;
160                    '\'' as u32
161                }
162                Some('"') => {
163                    consumed += 1;
164                    '"' as u32
165                }
166                Some('0') => {
167                    consumed += 1;
168                    0u32
169                }
170                Some('u') => {
171                    consumed += 1;
172                    // Expect '{' next
173                    match chars.next() {
174                        Some('{') => {
175                            consumed += 1;
176                        }
177                        _ => {
178                            lex.bump(consumed);
179                            return Err(LexError::InvalidUnicodeEscape);
180                        }
181                    }
182                    // Read up to 6 hex digits, then expect '}'
183                    let mut hex = String::new();
184                    let mut closed = false;
185                    while let Some(c) = chars.next() {
186                        consumed += c.len_utf8();
187                        if c == '}' {
188                            closed = true;
189                            break;
190                        }
191                        if hex.len() >= 6 {
192                            lex.bump(consumed);
193                            return Err(LexError::InvalidUnicodeEscape);
194                        }
195                        if c.is_ascii_hexdigit() {
196                            hex.push(c);
197                        } else {
198                            lex.bump(consumed);
199                            return Err(LexError::InvalidUnicodeEscape);
200                        }
201                    }
202                    if !closed || hex.is_empty() {
203                        lex.bump(consumed);
204                        return Err(LexError::InvalidUnicodeEscape);
205                    }
206                    let n = u32::from_str_radix(&hex, 16)
207                        .map_err(|_| LexError::InvalidUnicodeEscape)?;
208                    // Validate scalar value range.
209                    if (0xD800..=0xDFFF).contains(&n) || n > 0x10FFFF {
210                        lex.bump(consumed);
211                        return Err(LexError::InvalidUnicodeEscape);
212                    }
213                    n
214                }
215                Some(_) => {
216                    lex.bump(consumed);
217                    return Err(LexError::InvalidCharEscape);
218                }
219            }
220        }
221        Some(c) => {
222            consumed += c.len_utf8();
223            c as u32
224        }
225    };
226
227    // Now expect the closing '.
228    match chars.next() {
229        Some('\'') => {
230            consumed += 1;
231            lex.bump(consumed);
232            Ok(scalar)
233        }
234        Some('\n') | Some('\r') | None => {
235            lex.bump(consumed);
236            Err(LexError::UnterminatedCharLit)
237        }
238        Some(_) => {
239            // Multi-character literal: consume until closing ' or newline.
240            // We already have at least one extra char.
241            // Don't bother counting precisely — bump past to end of literal.
242            // This produces a sharper error if the user has 'ab' or 'abc'.
243            // We must still advance past the rest of the literal so subsequent
244            // tokens scan correctly.
245            //
246            // For 'ab', the second char's len was already consumed via chars.next();
247            // we only added the first char's len. Recover: we need to count what
248            // we just read plus walk forward until ' or newline.
249            // Simplest: bump what we have, then keep eating chars until a closing '.
250            // Track that consumed chars.
251            // Re-advance: the second char ate chars.next() above; capture its len.
252            // (We discarded the value but not the bytes.)
253            // Restart from scratch with a fresh remainder slice:
254            let new_remainder = &lex.remainder()[consumed..];
255            let mut extra = 0;
256            let mut found_close = false;
257            for c in new_remainder.chars() {
258                if c == '\n' || c == '\r' {
259                    break;
260                }
261                extra += c.len_utf8();
262                if c == '\'' {
263                    found_close = true;
264                    break;
265                }
266            }
267            lex.bump(consumed + extra);
268            if found_close {
269                Err(LexError::MultiCharLit)
270            } else {
271                Err(LexError::UnterminatedCharLit)
272            }
273        }
274    }
275}
276
277/// Callback for `///` line doc comments (ADR-0089).
278///
279/// On entry, the lexer has matched the three slashes only. We consume
280/// the rest of the line (up to but not including the terminating `\n`),
281/// strip a single leading space if present (matching Rust), intern the
282/// resulting body text, and return the interned `Spur`. The `////…`
283/// (four-or-more slash) case is handled by a separate skip pattern with
284/// higher priority — also matching Rust.
285fn process_line_doc(lex: &mut logos::Lexer<'_, LogosTokenKind>) -> Spur {
286    let remainder = lex.remainder();
287    let line_end = remainder.find('\n').unwrap_or(remainder.len());
288    let body = &remainder[..line_end];
289    let stripped = body.strip_prefix(' ').unwrap_or(body);
290    let spur = lex.extras.get_or_intern(stripped);
291    lex.bump(line_end);
292    spur
293}
294
295/// Token kinds in the Gruel language, using logos derive macro.
296#[derive(Logos, Debug, Clone, PartialEq, Eq)]
297#[logos(error = LexError)]
298#[logos(extras = ThreadedRodeo)]
299#[logos(skip r"[ \t\n\r\f]+")]
300// `////` (four or more slashes) is a plain comment, matching Rust.
301#[logos(skip r"////+[^\n]*")]
302// Regular line comments — `//` not followed by another slash, or a bare `//`.
303#[logos(skip r"//[^/\n][^\n]*")]
304#[logos(skip r"//")]
305pub enum LogosTokenKind {
306    /// ADR-0089: `///` line doc comment. The associated `Spur` is the
307    /// interned line body with the `///` marker and at most one leading
308    /// space removed.
309    #[token("///", process_line_doc)]
310    LineDoc(Spur),
311
312    // Keywords - logos prefers longer/specific matches over shorter/generic ones
313    #[token("fn")]
314    Fn,
315    #[token("let")]
316    Let,
317    #[token("mut")]
318    Mut,
319    #[token("if")]
320    If,
321    #[token("else")]
322    Else,
323    #[token("match")]
324    Match,
325    #[token("while")]
326    While,
327    #[token("for")]
328    For,
329    #[token("in")]
330    In,
331    #[token("loop")]
332    Loop,
333    #[token("break")]
334    Break,
335    #[token("continue")]
336    Continue,
337    #[token("return")]
338    Return,
339    #[token("true")]
340    True,
341    #[token("false")]
342    False,
343    #[token("struct")]
344    Struct,
345    #[token("enum")]
346    Enum,
347    #[token("interface")]
348    Interface,
349    #[token("self")]
350    SelfValue,
351    #[token("Self")]
352    SelfType,
353    #[token("comptime_unroll")]
354    ComptimeUnroll,
355    #[token("comptime")]
356    Comptime,
357    #[token("derive")]
358    Derive,
359    #[token("pub")]
360    Pub,
361    #[token("const")]
362    Const,
363    #[token("checked")]
364    Checked,
365    /// ADR-0085: introduces an extern-declaration block linked against the
366    /// named library. Recognised as a keyword unconditionally; sema gates
367    /// usage behind the `c_ffi` preview feature.
368    #[token("link_extern")]
369    LinkExtern,
370    /// ADR-0086: `static_link_extern("foo") { … }` — sibling to
371    /// `link_extern` that requests static linkage.
372    #[token("static_link_extern")]
373    StaticLinkExtern,
374
375    // Type keywords
376    #[token("i8")]
377    I8,
378    #[token("i16")]
379    I16,
380    #[token("i32")]
381    I32,
382    #[token("i64")]
383    I64,
384    #[token("isize")]
385    Isize,
386    #[token("u8")]
387    U8,
388    #[token("u16")]
389    U16,
390    #[token("u32")]
391    U32,
392    #[token("u64")]
393    U64,
394    #[token("usize")]
395    Usize,
396    #[token("f16")]
397    F16,
398    #[token("f32")]
399    F32,
400    #[token("f64")]
401    F64,
402    #[token("bool")]
403    Bool,
404    #[token("char")]
405    Char,
406
407    // Patterns
408    #[token("_")]
409    Underscore,
410
411    // Floating-point literals (must appear before Int so 42.0 matches as float, not int + dot + int)
412    // Matches: 3.14, 1.0e10, 2.5E-3, 1e10, 1E+5
413    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?", |lex| {
414        lex.slice().parse::<f64>().map(|v| v.to_bits()).map_err(|_| LexError::InvalidFloat)
415    })]
416    #[regex(r"[0-9]+[eE][+-]?[0-9]+", |lex| {
417        lex.slice().parse::<f64>().map(|v| v.to_bits()).map_err(|_| LexError::InvalidFloat)
418    })]
419    Float(u64),
420
421    // Integer literals
422    #[regex(r"[0-9]+", |lex| lex.slice().parse::<u64>().map_err(|_| LexError::InvalidInteger))]
423    Int(u64),
424
425    // String literals - match opening quote and process content manually
426    // This allows detection of unterminated strings
427    #[token("\"", process_string_from_quote)]
428    String(Spur),
429
430    // Char literals (ADR-0071) - opening single-quote triggers the scanner.
431    #[token("'", process_char_from_quote)]
432    CharLit(u32),
433
434    // Identifiers (lower priority than keywords)
435    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.extras.get_or_intern(lex.slice()), priority = 1)]
436    Ident(Spur),
437
438    // Multi-character operators (logos automatically prefers longer matches)
439    #[token("==")]
440    EqEq,
441    #[token("!=")]
442    BangEq,
443    #[token("<=")]
444    LtEq,
445    #[token(">=")]
446    GtEq,
447    #[token("&&")]
448    AmpAmp,
449    #[token("||")]
450    PipePipe,
451    #[token("<<")]
452    LtLt,
453    #[token(">>")]
454    GtGt,
455    #[token("->")]
456    Arrow,
457    #[token("=>")]
458    FatArrow,
459    #[token("::")]
460    ColonColon,
461
462    // Single-character operators
463    #[token("+")]
464    Plus,
465    #[token("-")]
466    Minus,
467    #[token("*")]
468    Star,
469    #[token("/")]
470    Slash,
471    #[token("%")]
472    Percent,
473    #[token("=")]
474    Eq,
475    #[token("!")]
476    Bang,
477    #[token("<")]
478    Lt,
479    #[token(">")]
480    Gt,
481    #[token("&")]
482    Amp,
483    #[token("|")]
484    Pipe,
485    #[token("^")]
486    Caret,
487    #[token("~")]
488    Tilde,
489
490    // Punctuation
491    #[token("(")]
492    LParen,
493    #[token(")")]
494    RParen,
495    #[token("{")]
496    LBrace,
497    #[token("}")]
498    RBrace,
499    #[token("[")]
500    LBracket,
501    #[token("]")]
502    RBracket,
503    #[token(":")]
504    Colon,
505    #[token(";")]
506    Semi,
507    #[token(",")]
508    Comma,
509    #[token(".")]
510    Dot,
511    #[token("@")]
512    At,
513
514    // Builtins - use callback to ensure @import is not followed by identifier chars
515    // This prevents @importx from being tokenized as @import + x
516    #[token("@import", at_import_callback)]
517    AtImport,
518}
519
520/// Callback for @import token to ensure word boundary.
521/// Returns Some(()) if @import is NOT followed by identifier chars, None otherwise.
522fn at_import_callback(lex: &mut logos::Lexer<'_, LogosTokenKind>) -> Option<()> {
523    match lex.remainder().chars().next() {
524        Some(c) if c.is_ascii_alphanumeric() || c == '_' => None,
525        _ => Some(()),
526    }
527}
528
529use crate::{Token, TokenKind};
530
531impl From<LogosTokenKind> for TokenKind {
532    fn from(logos_kind: LogosTokenKind) -> Self {
533        match logos_kind {
534            LogosTokenKind::Fn => TokenKind::Fn,
535            LogosTokenKind::Let => TokenKind::Let,
536            LogosTokenKind::Mut => TokenKind::Mut,
537            LogosTokenKind::If => TokenKind::If,
538            LogosTokenKind::Else => TokenKind::Else,
539            LogosTokenKind::Match => TokenKind::Match,
540            LogosTokenKind::While => TokenKind::While,
541            LogosTokenKind::For => TokenKind::For,
542            LogosTokenKind::In => TokenKind::In,
543            LogosTokenKind::Loop => TokenKind::Loop,
544            LogosTokenKind::Break => TokenKind::Break,
545            LogosTokenKind::Continue => TokenKind::Continue,
546            LogosTokenKind::Return => TokenKind::Return,
547            LogosTokenKind::True => TokenKind::True,
548            LogosTokenKind::False => TokenKind::False,
549            LogosTokenKind::Struct => TokenKind::Struct,
550            LogosTokenKind::Enum => TokenKind::Enum,
551            LogosTokenKind::Interface => TokenKind::Interface,
552            LogosTokenKind::SelfValue => TokenKind::SelfValue,
553            LogosTokenKind::SelfType => TokenKind::SelfType,
554            LogosTokenKind::ComptimeUnroll => TokenKind::ComptimeUnroll,
555            LogosTokenKind::Comptime => TokenKind::Comptime,
556            LogosTokenKind::Derive => TokenKind::Derive,
557            LogosTokenKind::Pub => TokenKind::Pub,
558            LogosTokenKind::Const => TokenKind::Const,
559            LogosTokenKind::Checked => TokenKind::Checked,
560            LogosTokenKind::LinkExtern => TokenKind::LinkExtern,
561            LogosTokenKind::StaticLinkExtern => TokenKind::StaticLinkExtern,
562            LogosTokenKind::I8 => TokenKind::I8,
563            LogosTokenKind::I16 => TokenKind::I16,
564            LogosTokenKind::I32 => TokenKind::I32,
565            LogosTokenKind::I64 => TokenKind::I64,
566            LogosTokenKind::Isize => TokenKind::Isize,
567            LogosTokenKind::U8 => TokenKind::U8,
568            LogosTokenKind::U16 => TokenKind::U16,
569            LogosTokenKind::U32 => TokenKind::U32,
570            LogosTokenKind::U64 => TokenKind::U64,
571            LogosTokenKind::Usize => TokenKind::Usize,
572            LogosTokenKind::F16 => TokenKind::F16,
573            LogosTokenKind::F32 => TokenKind::F32,
574            LogosTokenKind::F64 => TokenKind::F64,
575            LogosTokenKind::Bool => TokenKind::Bool,
576            LogosTokenKind::Char => TokenKind::Char,
577            LogosTokenKind::Float(bits) => TokenKind::Float(bits),
578            LogosTokenKind::Underscore => TokenKind::Underscore,
579            LogosTokenKind::Int(n) => TokenKind::Int(n),
580            LogosTokenKind::String(s) => TokenKind::String(s),
581            LogosTokenKind::CharLit(c) => TokenKind::CharLit(c),
582            LogosTokenKind::Ident(s) => TokenKind::Ident(s),
583            LogosTokenKind::EqEq => TokenKind::EqEq,
584            LogosTokenKind::BangEq => TokenKind::BangEq,
585            LogosTokenKind::LtEq => TokenKind::LtEq,
586            LogosTokenKind::GtEq => TokenKind::GtEq,
587            LogosTokenKind::AmpAmp => TokenKind::AmpAmp,
588            LogosTokenKind::PipePipe => TokenKind::PipePipe,
589            LogosTokenKind::LtLt => TokenKind::LtLt,
590            LogosTokenKind::GtGt => TokenKind::GtGt,
591            LogosTokenKind::Arrow => TokenKind::Arrow,
592            LogosTokenKind::FatArrow => TokenKind::FatArrow,
593            LogosTokenKind::ColonColon => TokenKind::ColonColon,
594            LogosTokenKind::Plus => TokenKind::Plus,
595            LogosTokenKind::Minus => TokenKind::Minus,
596            LogosTokenKind::Star => TokenKind::Star,
597            LogosTokenKind::Slash => TokenKind::Slash,
598            LogosTokenKind::Percent => TokenKind::Percent,
599            LogosTokenKind::Eq => TokenKind::Eq,
600            LogosTokenKind::Bang => TokenKind::Bang,
601            LogosTokenKind::Lt => TokenKind::Lt,
602            LogosTokenKind::Gt => TokenKind::Gt,
603            LogosTokenKind::Amp => TokenKind::Amp,
604            LogosTokenKind::Pipe => TokenKind::Pipe,
605            LogosTokenKind::Caret => TokenKind::Caret,
606            LogosTokenKind::Tilde => TokenKind::Tilde,
607            LogosTokenKind::LParen => TokenKind::LParen,
608            LogosTokenKind::RParen => TokenKind::RParen,
609            LogosTokenKind::LBrace => TokenKind::LBrace,
610            LogosTokenKind::RBrace => TokenKind::RBrace,
611            LogosTokenKind::LBracket => TokenKind::LBracket,
612            LogosTokenKind::RBracket => TokenKind::RBracket,
613            LogosTokenKind::Colon => TokenKind::Colon,
614            LogosTokenKind::Semi => TokenKind::Semi,
615            LogosTokenKind::Comma => TokenKind::Comma,
616            LogosTokenKind::Dot => TokenKind::Dot,
617            LogosTokenKind::At => TokenKind::At,
618            // AtImport is handled specially in tokenize() to provide the interned "import" Spur
619            LogosTokenKind::AtImport => unreachable!("AtImport should be handled specially"),
620            LogosTokenKind::LineDoc(s) => TokenKind::LineDoc(s),
621        }
622    }
623}
624
625/// Logos-based lexer that converts source text into tokens.
626pub struct LogosLexer<'a> {
627    source: &'a str,
628    interner: ThreadedRodeo,
629    file_id: FileId,
630}
631
632impl<'a> LogosLexer<'a> {
633    /// Create a new lexer for the given source text with a fresh interner.
634    ///
635    /// Uses the default file ID. For multi-file compilation, use `with_file_id`.
636    pub fn new(source: &'a str) -> Self {
637        Self {
638            source,
639            interner: ThreadedRodeo::default(),
640            file_id: FileId::DEFAULT,
641        }
642    }
643
644    /// Create a new lexer with an existing interner.
645    pub fn with_interner(source: &'a str, interner: ThreadedRodeo) -> Self {
646        Self {
647            source,
648            interner,
649            file_id: FileId::DEFAULT,
650        }
651    }
652
653    /// Create a new lexer with a specific file ID.
654    pub fn with_file_id(source: &'a str, file_id: FileId) -> Self {
655        Self {
656            source,
657            interner: ThreadedRodeo::default(),
658            file_id,
659        }
660    }
661
662    /// Create a new lexer with both an existing interner and a specific file ID.
663    pub fn with_interner_and_file_id(
664        source: &'a str,
665        interner: ThreadedRodeo,
666        file_id: FileId,
667    ) -> Self {
668        Self {
669            source,
670            interner,
671            file_id,
672        }
673    }
674
675    /// Tokenize the entire source, returning all tokens and the interner.
676    pub fn tokenize(self) -> CompileResult<(Vec<Token>, ThreadedRodeo)> {
677        // Estimate capacity: source length / 4 is a rough heuristic for token density
678        let mut tokens = Vec::with_capacity(self.source.len() / 4);
679
680        let mut lexer = LogosTokenKind::lexer_with_extras(self.source, self.interner);
681
682        while let Some(result) = lexer.next() {
683            let span = lexer.span();
684            match result {
685                Ok(logos_kind) => {
686                    // Convert LogosTokenKind to TokenKind, handling @import specially
687                    // because it needs to carry the interned "import" symbol
688                    let token_kind = if matches!(logos_kind, LogosTokenKind::AtImport) {
689                        let import_spur = lexer.extras.get_or_intern("import");
690                        TokenKind::AtImport(import_spur)
691                    } else {
692                        logos_kind.into()
693                    };
694                    tokens.push(Token {
695                        kind: token_kind,
696                        span: Span::with_file(self.file_id, span.start as u32, span.end as u32),
697                    });
698                }
699                Err(lex_error) => {
700                    let gruel_util =
701                        Span::with_file(self.file_id, span.start as u32, span.end as u32);
702                    let slice = lexer.slice();
703                    let error_char = slice.chars().next().unwrap_or('?');
704                    let kind = match lex_error {
705                        LexError::InvalidInteger => ErrorKind::InvalidInteger,
706                        LexError::InvalidFloat => ErrorKind::InvalidFloat,
707                        LexError::UnexpectedCharacter => ErrorKind::UnexpectedCharacter(error_char),
708                        LexError::InvalidStringEscape => {
709                            // Find the escape character after backslash
710                            let escape_char = slice
711                                .find('\\')
712                                .and_then(|pos| slice[pos + 1..].chars().next())
713                                .unwrap_or('?');
714                            ErrorKind::InvalidStringEscape(escape_char)
715                        }
716                        LexError::UnterminatedString => ErrorKind::UnterminatedString,
717                        LexError::EmptyCharLit => ErrorKind::EmptyCharLit,
718                        LexError::UnterminatedCharLit => ErrorKind::UnterminatedCharLit,
719                        LexError::MultiCharLit => ErrorKind::MultiCharLit,
720                        LexError::InvalidCharEscape => ErrorKind::InvalidCharEscape,
721                        LexError::InvalidUnicodeEscape => ErrorKind::InvalidUnicodeEscape,
722                    };
723                    return Err(CompileError::new(kind, gruel_util));
724                }
725            }
726        }
727
728        // Add EOF token (logos doesn't emit EOF)
729        let eof_pos = self.source.len() as u32;
730        tokens.push(Token {
731            kind: TokenKind::Eof,
732            span: Span::point_in_file(self.file_id, eof_pos),
733        });
734
735        // Extract the interner from the logos lexer
736        let interner = lexer.extras;
737
738        Ok((tokens, interner))
739    }
740}
741
742#[cfg(test)]
743mod tests {
744    use super::*;
745
746    /// Helper to get the string for a symbol from the interner.
747    fn get_ident_str<'a>(kind: &TokenKind, interner: &'a ThreadedRodeo) -> Option<&'a str> {
748        match kind {
749            TokenKind::Ident(sym) => Some(interner.resolve(sym)),
750            _ => None,
751        }
752    }
753
754    /// Helper to get the string for a string literal symbol.
755    fn get_string_str<'a>(kind: &TokenKind, interner: &'a ThreadedRodeo) -> Option<&'a str> {
756        match kind {
757            TokenKind::String(sym) => Some(interner.resolve(sym)),
758            _ => None,
759        }
760    }
761
762    #[test]
763    fn test_logos_basic_tokens() {
764        let lexer = LogosLexer::new("fn main() -> i32 { 42 }");
765        let (tokens, interner) = lexer.tokenize().unwrap();
766
767        assert!(matches!(tokens[0].kind, TokenKind::Fn));
768        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("main"));
769        assert!(matches!(tokens[2].kind, TokenKind::LParen));
770        assert!(matches!(tokens[3].kind, TokenKind::RParen));
771        assert!(matches!(tokens[4].kind, TokenKind::Arrow));
772        assert!(matches!(tokens[5].kind, TokenKind::I32));
773        assert!(matches!(tokens[6].kind, TokenKind::LBrace));
774        assert!(matches!(tokens[7].kind, TokenKind::Int(42)));
775        assert!(matches!(tokens[8].kind, TokenKind::RBrace));
776        assert!(matches!(tokens[9].kind, TokenKind::Eof));
777    }
778
779    #[test]
780    fn test_logos_unexpected_character() {
781        let lexer = LogosLexer::new("fn main() { $ }");
782        let result = lexer.tokenize();
783        assert!(result.is_err());
784        let err = result.unwrap_err();
785        assert!(matches!(err.kind, ErrorKind::UnexpectedCharacter('$')));
786    }
787
788    #[test]
789    fn test_logos_at_token() {
790        let lexer = LogosLexer::new("@dbg");
791        let (tokens, interner) = lexer.tokenize().unwrap();
792        assert!(matches!(tokens[0].kind, TokenKind::At));
793        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("dbg"));
794    }
795
796    #[test]
797    fn test_logos_at_import_token() {
798        // @import should be recognized as a single token with interned "import" Spur
799        let lexer = LogosLexer::new("@import");
800        let (tokens, interner) = lexer.tokenize().unwrap();
801        if let TokenKind::AtImport(spur) = tokens[0].kind {
802            assert_eq!(interner.resolve(&spur), "import");
803        } else {
804            panic!("Expected AtImport token");
805        }
806        assert!(matches!(tokens[1].kind, TokenKind::Eof));
807    }
808
809    #[test]
810    fn test_logos_at_import_vs_at_other() {
811        // @import as single token vs @other (At + Ident)
812        let lexer = LogosLexer::new("@import @other");
813        let (tokens, interner) = lexer.tokenize().unwrap();
814        assert!(matches!(tokens[0].kind, TokenKind::AtImport(_)));
815        assert!(matches!(tokens[1].kind, TokenKind::At));
816        assert_eq!(get_ident_str(&tokens[2].kind, &interner), Some("other"));
817    }
818
819    #[test]
820    fn test_logos_at_import_span() {
821        // Verify the span covers the entire @import token
822        let lexer = LogosLexer::new("@import");
823        let (tokens, _) = lexer.tokenize().unwrap();
824        assert_eq!(tokens[0].span, Span::new(0, 7)); // "@import" is 7 chars
825    }
826
827    #[test]
828    fn test_logos_at_import_with_parens() {
829        // @import("path.gruel") pattern
830        let lexer = LogosLexer::new(r#"@import("math.gruel")"#);
831        let (tokens, interner) = lexer.tokenize().unwrap();
832        assert!(matches!(tokens[0].kind, TokenKind::AtImport(_)));
833        assert!(matches!(tokens[1].kind, TokenKind::LParen));
834        assert_eq!(
835            get_string_str(&tokens[2].kind, &interner),
836            Some("math.gruel")
837        );
838        assert!(matches!(tokens[3].kind, TokenKind::RParen));
839    }
840
841    #[test]
842    fn test_logos_at_import_suffix_is_error() {
843        // @importx is an invalid token - @import followed by x cannot be a valid construct
844        // The lexer produces an error because @import matches but is followed by 'x'
845        // which makes it an invalid token sequence
846        let lexer = LogosLexer::new("@importx");
847        let result = lexer.tokenize();
848        // This should error because @importx is neither @import nor @ followed by a space
849        assert!(result.is_err());
850    }
851
852    #[test]
853    fn test_logos_spans() {
854        let lexer = LogosLexer::new("fn main");
855        let (tokens, _interner) = lexer.tokenize().unwrap();
856
857        assert_eq!(tokens[0].span, Span::new(0, 2)); // "fn"
858        assert_eq!(tokens[1].span, Span::new(3, 7)); // "main"
859    }
860
861    #[test]
862    fn test_logos_arithmetic_operators() {
863        let lexer = LogosLexer::new("1 + 2 - 3 * 4 / 5 % 6");
864        let (tokens, _interner) = lexer.tokenize().unwrap();
865
866        assert!(matches!(tokens[0].kind, TokenKind::Int(1)));
867        assert!(matches!(tokens[1].kind, TokenKind::Plus));
868        assert!(matches!(tokens[2].kind, TokenKind::Int(2)));
869        assert!(matches!(tokens[3].kind, TokenKind::Minus));
870        assert!(matches!(tokens[4].kind, TokenKind::Int(3)));
871        assert!(matches!(tokens[5].kind, TokenKind::Star));
872        assert!(matches!(tokens[6].kind, TokenKind::Int(4)));
873        assert!(matches!(tokens[7].kind, TokenKind::Slash));
874        assert!(matches!(tokens[8].kind, TokenKind::Int(5)));
875        assert!(matches!(tokens[9].kind, TokenKind::Percent));
876        assert!(matches!(tokens[10].kind, TokenKind::Int(6)));
877    }
878
879    #[test]
880    fn test_logos_minus_vs_arrow() {
881        // Minus alone
882        let lexer = LogosLexer::new("a - b");
883        let (tokens, _) = lexer.tokenize().unwrap();
884        assert!(matches!(tokens[1].kind, TokenKind::Minus));
885
886        // Arrow
887        let lexer = LogosLexer::new("-> i32");
888        let (tokens, _) = lexer.tokenize().unwrap();
889        assert!(matches!(tokens[0].kind, TokenKind::Arrow));
890
891        // Minus followed by non-arrow
892        let lexer = LogosLexer::new("-1");
893        let (tokens, _) = lexer.tokenize().unwrap();
894        assert!(matches!(tokens[0].kind, TokenKind::Minus));
895        assert!(matches!(tokens[1].kind, TokenKind::Int(1)));
896    }
897
898    #[test]
899    fn test_logos_let_binding() {
900        let lexer = LogosLexer::new("let x = 42;");
901        let (tokens, interner) = lexer.tokenize().unwrap();
902
903        assert!(matches!(tokens[0].kind, TokenKind::Let));
904        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("x"));
905        assert!(matches!(tokens[2].kind, TokenKind::Eq));
906        assert!(matches!(tokens[3].kind, TokenKind::Int(42)));
907        assert!(matches!(tokens[4].kind, TokenKind::Semi));
908    }
909
910    #[test]
911    fn test_logos_logical_operators() {
912        let lexer = LogosLexer::new("!true && false || true");
913        let (tokens, _) = lexer.tokenize().unwrap();
914
915        assert!(matches!(tokens[0].kind, TokenKind::Bang));
916        assert!(matches!(tokens[1].kind, TokenKind::True));
917        assert!(matches!(tokens[2].kind, TokenKind::AmpAmp));
918        assert!(matches!(tokens[3].kind, TokenKind::False));
919        assert!(matches!(tokens[4].kind, TokenKind::PipePipe));
920        assert!(matches!(tokens[5].kind, TokenKind::True));
921    }
922
923    #[test]
924    fn test_logos_comparison_operators() {
925        let lexer = LogosLexer::new("a == b != c < d > e <= f >= g");
926        let (tokens, _) = lexer.tokenize().unwrap();
927
928        assert!(matches!(tokens[1].kind, TokenKind::EqEq));
929        assert!(matches!(tokens[3].kind, TokenKind::BangEq));
930        assert!(matches!(tokens[5].kind, TokenKind::Lt));
931        assert!(matches!(tokens[7].kind, TokenKind::Gt));
932        assert!(matches!(tokens[9].kind, TokenKind::LtEq));
933        assert!(matches!(tokens[11].kind, TokenKind::GtEq));
934    }
935
936    #[test]
937    fn test_logos_line_comments() {
938        let lexer = LogosLexer::new("fn // comment\nmain");
939        let (tokens, interner) = lexer.tokenize().unwrap();
940
941        assert!(matches!(tokens[0].kind, TokenKind::Fn));
942        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("main"));
943        assert!(matches!(tokens[2].kind, TokenKind::Eof));
944    }
945
946    #[test]
947    fn test_logos_keywords_vs_identifiers() {
948        // Keywords should be recognized
949        let lexer = LogosLexer::new("fn let mut if else while break continue true false");
950        let (tokens, _) = lexer.tokenize().unwrap();
951
952        assert!(matches!(tokens[0].kind, TokenKind::Fn));
953        assert!(matches!(tokens[1].kind, TokenKind::Let));
954        assert!(matches!(tokens[2].kind, TokenKind::Mut));
955        assert!(matches!(tokens[3].kind, TokenKind::If));
956        assert!(matches!(tokens[4].kind, TokenKind::Else));
957        assert!(matches!(tokens[5].kind, TokenKind::While));
958        assert!(matches!(tokens[6].kind, TokenKind::Break));
959        assert!(matches!(tokens[7].kind, TokenKind::Continue));
960        assert!(matches!(tokens[8].kind, TokenKind::True));
961        assert!(matches!(tokens[9].kind, TokenKind::False));
962
963        // Identifiers that start with keywords should be identifiers
964        let lexer = LogosLexer::new("fns lets mutable iff elseif whileloop");
965        let (tokens, interner) = lexer.tokenize().unwrap();
966
967        assert_eq!(get_ident_str(&tokens[0].kind, &interner), Some("fns"));
968        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("lets"));
969        assert_eq!(get_ident_str(&tokens[2].kind, &interner), Some("mutable"));
970        assert_eq!(get_ident_str(&tokens[3].kind, &interner), Some("iff"));
971        assert_eq!(get_ident_str(&tokens[4].kind, &interner), Some("elseif"));
972        assert_eq!(get_ident_str(&tokens[5].kind, &interner), Some("whileloop"));
973    }
974
975    #[test]
976    fn test_logos_bitwise_operators() {
977        let lexer = LogosLexer::new("a & b | c ^ d ~ e << f >> g");
978        let (tokens, interner) = lexer.tokenize().unwrap();
979
980        assert_eq!(get_ident_str(&tokens[0].kind, &interner), Some("a"));
981        assert!(matches!(tokens[1].kind, TokenKind::Amp));
982        assert_eq!(get_ident_str(&tokens[2].kind, &interner), Some("b"));
983        assert!(matches!(tokens[3].kind, TokenKind::Pipe));
984        assert_eq!(get_ident_str(&tokens[4].kind, &interner), Some("c"));
985        assert!(matches!(tokens[5].kind, TokenKind::Caret));
986        assert_eq!(get_ident_str(&tokens[6].kind, &interner), Some("d"));
987        assert!(matches!(tokens[7].kind, TokenKind::Tilde));
988        assert_eq!(get_ident_str(&tokens[8].kind, &interner), Some("e"));
989        assert!(matches!(tokens[9].kind, TokenKind::LtLt));
990        assert_eq!(get_ident_str(&tokens[10].kind, &interner), Some("f"));
991        assert!(matches!(tokens[11].kind, TokenKind::GtGt));
992        assert_eq!(get_ident_str(&tokens[12].kind, &interner), Some("g"));
993    }
994
995    #[test]
996    fn test_logos_bitwise_vs_logical() {
997        // Single & should be bitwise AND
998        let lexer = LogosLexer::new("a & b");
999        let (tokens, _) = lexer.tokenize().unwrap();
1000        assert!(matches!(tokens[1].kind, TokenKind::Amp));
1001
1002        // Double && should be logical AND
1003        let lexer = LogosLexer::new("a && b");
1004        let (tokens, _) = lexer.tokenize().unwrap();
1005        assert!(matches!(tokens[1].kind, TokenKind::AmpAmp));
1006
1007        // Single | should be bitwise OR
1008        let lexer = LogosLexer::new("a | b");
1009        let (tokens, _) = lexer.tokenize().unwrap();
1010        assert!(matches!(tokens[1].kind, TokenKind::Pipe));
1011
1012        // Double || should be logical OR
1013        let lexer = LogosLexer::new("a || b");
1014        let (tokens, _) = lexer.tokenize().unwrap();
1015        assert!(matches!(tokens[1].kind, TokenKind::PipePipe));
1016    }
1017
1018    #[test]
1019    fn test_logos_shift_vs_comparison() {
1020        // << should be left shift
1021        let lexer = LogosLexer::new("a << b");
1022        let (tokens, _) = lexer.tokenize().unwrap();
1023        assert!(matches!(tokens[1].kind, TokenKind::LtLt));
1024
1025        // >> should be right shift
1026        let lexer = LogosLexer::new("a >> b");
1027        let (tokens, _) = lexer.tokenize().unwrap();
1028        assert!(matches!(tokens[1].kind, TokenKind::GtGt));
1029
1030        // < should be less than
1031        let lexer = LogosLexer::new("a < b");
1032        let (tokens, _) = lexer.tokenize().unwrap();
1033        assert!(matches!(tokens[1].kind, TokenKind::Lt));
1034
1035        // > should be greater than
1036        let lexer = LogosLexer::new("a > b");
1037        let (tokens, _) = lexer.tokenize().unwrap();
1038        assert!(matches!(tokens[1].kind, TokenKind::Gt));
1039
1040        // <= should be less than or equal
1041        let lexer = LogosLexer::new("a <= b");
1042        let (tokens, _) = lexer.tokenize().unwrap();
1043        assert!(matches!(tokens[1].kind, TokenKind::LtEq));
1044
1045        // >= should be greater than or equal
1046        let lexer = LogosLexer::new("a >= b");
1047        let (tokens, _) = lexer.tokenize().unwrap();
1048        assert!(matches!(tokens[1].kind, TokenKind::GtEq));
1049    }
1050
1051    #[test]
1052    fn test_logos_integer_overflow() {
1053        // A number too large for u64 should produce InvalidInteger error
1054        let lexer = LogosLexer::new("99999999999999999999999");
1055        let result = lexer.tokenize();
1056        assert!(result.is_err());
1057        let err = result.unwrap_err();
1058        assert!(matches!(err.kind, ErrorKind::InvalidInteger));
1059    }
1060
1061    #[test]
1062    fn test_logos_type_keywords() {
1063        // Type names should be recognized as keywords, not identifiers
1064        let lexer = LogosLexer::new("i8 i16 i32 i64 u8 u16 u32 u64 bool");
1065        let (tokens, _) = lexer.tokenize().unwrap();
1066
1067        assert!(matches!(tokens[0].kind, TokenKind::I8));
1068        assert!(matches!(tokens[1].kind, TokenKind::I16));
1069        assert!(matches!(tokens[2].kind, TokenKind::I32));
1070        assert!(matches!(tokens[3].kind, TokenKind::I64));
1071        assert!(matches!(tokens[4].kind, TokenKind::U8));
1072        assert!(matches!(tokens[5].kind, TokenKind::U16));
1073        assert!(matches!(tokens[6].kind, TokenKind::U32));
1074        assert!(matches!(tokens[7].kind, TokenKind::U64));
1075        assert!(matches!(tokens[8].kind, TokenKind::Bool));
1076
1077        // Identifiers that start with type names should be identifiers
1078        let lexer = LogosLexer::new("i32x i64ptr boolish u8_data");
1079        let (tokens, interner) = lexer.tokenize().unwrap();
1080
1081        assert_eq!(get_ident_str(&tokens[0].kind, &interner), Some("i32x"));
1082        assert_eq!(get_ident_str(&tokens[1].kind, &interner), Some("i64ptr"));
1083        assert_eq!(get_ident_str(&tokens[2].kind, &interner), Some("boolish"));
1084        assert_eq!(get_ident_str(&tokens[3].kind, &interner), Some("u8_data"));
1085    }
1086
1087    #[test]
1088    fn test_logos_unterminated_string() {
1089        // String without closing quote at end of input
1090        let lexer = LogosLexer::new(r#""hello"#);
1091        let result = lexer.tokenize();
1092        assert!(result.is_err());
1093        let err = result.unwrap_err();
1094        assert!(matches!(err.kind, ErrorKind::UnterminatedString));
1095
1096        // String without closing quote followed by newline
1097        let lexer = LogosLexer::new("\"hello\nworld");
1098        let result = lexer.tokenize();
1099        assert!(result.is_err());
1100        let err = result.unwrap_err();
1101        assert!(matches!(err.kind, ErrorKind::UnterminatedString));
1102
1103        // Just an opening quote
1104        let lexer = LogosLexer::new("\"");
1105        let result = lexer.tokenize();
1106        assert!(result.is_err());
1107        let err = result.unwrap_err();
1108        assert!(matches!(err.kind, ErrorKind::UnterminatedString));
1109    }
1110
1111    #[test]
1112    fn test_logos_valid_strings() {
1113        // Valid complete string
1114        let lexer = LogosLexer::new(r#""hello""#);
1115        let (tokens, interner) = lexer.tokenize().unwrap();
1116        assert_eq!(get_string_str(&tokens[0].kind, &interner), Some("hello"));
1117
1118        // Empty string
1119        let lexer = LogosLexer::new(r#""""#);
1120        let (tokens, interner) = lexer.tokenize().unwrap();
1121        assert_eq!(get_string_str(&tokens[0].kind, &interner), Some(""));
1122
1123        // String with escaped quote
1124        let lexer = LogosLexer::new(r#""hello\"world""#);
1125        let (tokens, interner) = lexer.tokenize().unwrap();
1126        assert_eq!(
1127            get_string_str(&tokens[0].kind, &interner),
1128            Some("hello\"world")
1129        );
1130
1131        // String with escaped backslash
1132        let lexer = LogosLexer::new(r#""hello\\world""#);
1133        let (tokens, interner) = lexer.tokenize().unwrap();
1134        assert_eq!(
1135            get_string_str(&tokens[0].kind, &interner),
1136            Some("hello\\world")
1137        );
1138    }
1139
1140    #[test]
1141    fn test_logos_escape_newline() {
1142        let lexer = LogosLexer::new(r#""line1\nline2""#);
1143        let (tokens, interner) = lexer.tokenize().unwrap();
1144        assert_eq!(
1145            get_string_str(&tokens[0].kind, &interner),
1146            Some("line1\nline2")
1147        );
1148    }
1149
1150    #[test]
1151    fn test_logos_escape_tab() {
1152        let lexer = LogosLexer::new(r#""col1\tcol2""#);
1153        let (tokens, interner) = lexer.tokenize().unwrap();
1154        assert_eq!(
1155            get_string_str(&tokens[0].kind, &interner),
1156            Some("col1\tcol2")
1157        );
1158    }
1159
1160    #[test]
1161    fn test_logos_escape_carriage_return() {
1162        let lexer = LogosLexer::new(r#""line\r\n""#);
1163        let (tokens, interner) = lexer.tokenize().unwrap();
1164        assert_eq!(get_string_str(&tokens[0].kind, &interner), Some("line\r\n"));
1165    }
1166
1167    #[test]
1168    fn test_logos_escape_null() {
1169        let lexer = LogosLexer::new(r#""null\0byte""#);
1170        let (tokens, interner) = lexer.tokenize().unwrap();
1171        assert_eq!(
1172            get_string_str(&tokens[0].kind, &interner),
1173            Some("null\0byte")
1174        );
1175    }
1176
1177    #[test]
1178    fn test_logos_invalid_escape_q() {
1179        let lexer = LogosLexer::new(r#""bad\qescape""#);
1180        let result = lexer.tokenize();
1181        assert!(result.is_err());
1182        let err = result.unwrap_err();
1183        assert!(matches!(err.kind, ErrorKind::InvalidStringEscape('q')));
1184    }
1185
1186    #[test]
1187    fn test_logos_all_escapes_combined() {
1188        // Test all escape sequences in one string
1189        let lexer = LogosLexer::new(r#""\\\"abc\n\t\r\0xyz""#);
1190        let (tokens, interner) = lexer.tokenize().unwrap();
1191        assert_eq!(
1192            get_string_str(&tokens[0].kind, &interner),
1193            Some("\\\"abc\n\t\r\0xyz")
1194        );
1195    }
1196
1197    #[test]
1198    fn test_interning_deduplicates() {
1199        // Same identifier appearing multiple times should have same Symbol
1200        let lexer = LogosLexer::new("x x x");
1201        let (tokens, _interner) = lexer.tokenize().unwrap();
1202
1203        let sym0 = match &tokens[0].kind {
1204            TokenKind::Ident(s) => *s,
1205            _ => panic!("expected Ident"),
1206        };
1207        let sym1 = match &tokens[1].kind {
1208            TokenKind::Ident(s) => *s,
1209            _ => panic!("expected Ident"),
1210        };
1211        let sym2 = match &tokens[2].kind {
1212            TokenKind::Ident(s) => *s,
1213            _ => panic!("expected Ident"),
1214        };
1215
1216        assert_eq!(sym0, sym1);
1217        assert_eq!(sym1, sym2);
1218    }
1219
1220    /// Helper to read a `LineDoc` body string.
1221    fn get_line_doc<'a>(kind: &TokenKind, interner: &'a ThreadedRodeo) -> Option<&'a str> {
1222        match kind {
1223            TokenKind::LineDoc(sym) => Some(interner.resolve(sym)),
1224            _ => None,
1225        }
1226    }
1227
1228    #[test]
1229    fn test_line_doc_basic() {
1230        // `/// x` → LineDoc("x") (one leading space stripped)
1231        let lexer = LogosLexer::new("/// hello\nfn main() {}");
1232        let (tokens, interner) = lexer.tokenize().unwrap();
1233        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some("hello"));
1234        assert!(matches!(tokens[1].kind, TokenKind::Fn));
1235    }
1236
1237    #[test]
1238    fn test_line_doc_no_space() {
1239        // `///x` → LineDoc("x") (no leading space to strip)
1240        let lexer = LogosLexer::new("///x\n");
1241        let (tokens, interner) = lexer.tokenize().unwrap();
1242        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some("x"));
1243    }
1244
1245    #[test]
1246    fn test_line_doc_empty() {
1247        // `///` alone (no body) → LineDoc("")
1248        let lexer = LogosLexer::new("///\n");
1249        let (tokens, interner) = lexer.tokenize().unwrap();
1250        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some(""));
1251    }
1252
1253    #[test]
1254    fn test_line_doc_strips_one_space_only() {
1255        // `///  hello` → LineDoc(" hello") — only ONE leading space stripped
1256        let lexer = LogosLexer::new("///  hello\n");
1257        let (tokens, interner) = lexer.tokenize().unwrap();
1258        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some(" hello"));
1259    }
1260
1261    #[test]
1262    fn test_four_slashes_is_plain_comment() {
1263        // `////` (4+ slashes) is a plain comment, matching Rust — skipped.
1264        let lexer = LogosLexer::new("//// not a doc\nfn main() {}");
1265        let (tokens, _interner) = lexer.tokenize().unwrap();
1266        assert!(matches!(tokens[0].kind, TokenKind::Fn));
1267    }
1268
1269    #[test]
1270    fn test_five_slashes_is_plain_comment() {
1271        let lexer = LogosLexer::new("///// also plain\nfn main() {}");
1272        let (tokens, _interner) = lexer.tokenize().unwrap();
1273        assert!(matches!(tokens[0].kind, TokenKind::Fn));
1274    }
1275
1276    #[test]
1277    fn test_line_doc_bang_in_body() {
1278        // `///!` is a doc comment with body "!" — not an inner-doc form
1279        // (we have no `//!` token).
1280        let lexer = LogosLexer::new("///!boom\n");
1281        let (tokens, interner) = lexer.tokenize().unwrap();
1282        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some("!boom"));
1283    }
1284
1285    #[test]
1286    fn test_regular_double_slash_skipped() {
1287        // Plain `//` is still skipped, as before.
1288        let lexer = LogosLexer::new("// just a comment\nfn main() {}");
1289        let (tokens, _interner) = lexer.tokenize().unwrap();
1290        assert!(matches!(tokens[0].kind, TokenKind::Fn));
1291    }
1292
1293    #[test]
1294    fn test_empty_double_slash_skipped() {
1295        // Lone `//` followed by EOL is also a plain comment.
1296        let lexer = LogosLexer::new("//\nfn main() {}");
1297        let (tokens, _interner) = lexer.tokenize().unwrap();
1298        assert!(matches!(tokens[0].kind, TokenKind::Fn));
1299    }
1300
1301    #[test]
1302    fn test_consecutive_line_docs() {
1303        // Each `///` line emits its own LineDoc token; the parser groups runs.
1304        let lexer = LogosLexer::new("/// line 1\n/// line 2\nfn main() {}");
1305        let (tokens, interner) = lexer.tokenize().unwrap();
1306        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some("line 1"));
1307        assert_eq!(get_line_doc(&tokens[1].kind, &interner), Some("line 2"));
1308        assert!(matches!(tokens[2].kind, TokenKind::Fn));
1309    }
1310
1311    #[test]
1312    fn test_line_doc_span_excludes_newline() {
1313        let lexer = LogosLexer::new("/// hi\n");
1314        let (tokens, _interner) = lexer.tokenize().unwrap();
1315        // "/// hi" is 6 bytes
1316        assert_eq!(tokens[0].span.start, 0);
1317        assert_eq!(tokens[0].span.end, 6);
1318    }
1319
1320    #[test]
1321    fn test_line_doc_at_eof_no_newline() {
1322        // A `///x` at end of input (no trailing newline) is still recognized.
1323        let lexer = LogosLexer::new("/// tail");
1324        let (tokens, interner) = lexer.tokenize().unwrap();
1325        assert_eq!(get_line_doc(&tokens[0].kind, &interner), Some("tail"));
1326    }
1327
1328    #[test]
1329    fn test_token_kind_is_copy() {
1330        // This test ensures TokenKind is Copy by using it in a context that requires Copy
1331        let lexer = LogosLexer::new("x");
1332        let (tokens, _) = lexer.tokenize().unwrap();
1333        let kind = tokens[0].kind; // This would fail if TokenKind weren't Copy
1334        let _kind2 = kind; // Use both without moving
1335        let _kind3 = kind;
1336    }
1337}