Skip to content

Lexer

Token

The lexer, also known as tokenizer or scanner, is responsible for transforming source text into tokens. The tokens will later be consumed by the parser so we don't have to worry about whitespaces and comments from the original text.

Let's start simple and transform a single + text into a token.

rust
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    /// Token Type
    pub kind: Kind,

    /// Start offset in source
    pub start: usize,

    /// End offset in source
    pub end: usize,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kind {
    Eof, // end of file
    Plus,
}

[ Token { kind: Kind::Plus, start: 0, end: 1 }, Token { kind: Kind::Eof, start: 1, end: 1 } ]

rust
use std::str::Chars;

struct Lexer<'a> {
    /// Source Text
    source: &'a str,

    /// The remaining characters
    chars: Chars<'a>
}

impl<'a> Lexer<'a> {
    pub fn new(source: &'a str) -> Self {
        Self {
            source,
            chars: source.chars()
        }
    }
}
``````rust
impl<'a> Lexer<'a> {
    fn read_next_kind(&mut self) -> Kind {
        while let Some(c) = self.chars.next() {
            match c {
              '+' => return Kind::Plus,
              _ => {}
            }
        }
        Kind::Eof
    }

    fn read_next_token(&mut self) -> Token {
        let start = self.offset();
        let kind = self.read_next_kind();
        let end = self.offset();
        Token { kind, start, end }
    }

    /// Get the length offset from the source text, in UTF-8 bytes
    fn offset(&self) -> usize {
        self.source.len() - self.chars.as_str().len()
    }
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/iter.rs#L112-L115

pub fn as_str(&self) -> &'a str {
    // SAFETY: `Chars` is only made from a str, which guarantees the iter is valid UTF-8.
    unsafe { from_utf8_unchecked(self.iter.as_slice()) }
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/mod.rs#L157-L159

pub const fn len(&self) -> usize {
    self.as_bytes().len()
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/mod.rs#L323-L325

pub const fn as_bytes(&self) -> &[u8] {
    // SAFETY: const sound because we transmute two types with the same layout
    unsafe { mem::transmute(self) }
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/slice/mod.rs#L129-L138

pub const fn len(&self) -> usize {
    // FIXME: Replace with `crate::ptr::metadata(self)` when that is const-stable.
    // As of this writing this causes a "Const-stable functions can only call other
    // const-stable functions" error.

    // SAFETY: Accessing the value from the `PtrRepr` union is safe since *const T
    // and PtrComponents<T> have the same memory layouts. Only std can make this
    // guarantee.
    unsafe { crate::ptr::PtrRepr { const_ptr: self }.components.metadata }
}
``````rust
fn peek(&self) -> Option<char> {
    self.chars.clone().next()
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/slice/iter.rs#L148-L152

impl<T> Clone for Iter<'_, T> {
    fn clone(&self) -> Self {
        Iter { ptr: self.ptr, end: self.end, _marker: self._marker }
    }
}
``````rust
// https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L1769-L1791

'+' => match self.peek() {
    Some('+') => {
        self.chars.next();
        return self.set_result(
            TerminalId::Increment,
            SourceLocation::new(start, self.offset()),
            TokenValue::None,
        );
    }
    Some('=') => {
        self.chars.next();
        return self.set_result(
            TerminalId::AddAssign,
            SourceLocation::new(start, self.offset()),
            TokenValue::None,
        );
    }
    _ => return self.set_result(
        TerminalId::Plus,
        SourceLocation::new(start, self.offset()),
        TokenValue::None,
    ),
},

IdentifierStartChar :: UnicodeIDStart

IdentifierPartChar :: UnicodeIDContinue

UnicodeIDStart :: any Unicode code point with the Unicode property “ID_Start”

UnicodeIDContinue :: any Unicode code point with the Unicode property “ID_Continue”

rust
pub enum Kind {
    Identifier,
    If,
    While,
    For
}
``````rust
fn match_keyword(&self, ident: &str) -> Kind {
    // all keywords are 1 < length <= 10
    if ident.len() == 1 || ident.len() > 10 {
        return Kind::Identifier;
    }
    match ident {
        "if" => Kind::If,
        "while" => Kind::While,
        "for" => Kind::For,
        _ => Kind::Identifier
    }
}
``````rust{4-6}
pub enum Kind {
    Eof, // end of file
    Plus,
    Identifier,
    Number,
    String,
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    /// Token Type
    pub kind: Kind,

    /// Start offset in source
    pub start: usize,

    /// End offset in source
    pub end: usize,

    pub value: TokenValue,
}

#[derive(Debug, Clone, PartialEq)]
pub enum TokenValue {
    None,
    Number(f64),
    String(String),
}

Token { kind: Kind::Identifier, start: 0, end: 2, value: TokenValue::String("foo") } Token

rust
pub enum Kind {
    Number(f64),
    String(String),
}
``````rust
#[derive(Debug, Clone, PartialEq)]
pub enum TokenValue {
    None,
    Number(f64),
    String(Atom), 
}
```