Lexer
Token
The lexer, also known as tokenizer or scanner, is responsible for transforming source text into tokens. The tokens will later be consumed by the parser so we don't have to worry about whitespaces and comments from the original text.
Let's start simple and transform a single + text into a token.
rust
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
/// Token Type
pub kind: Kind,
/// Start offset in source
pub start: usize,
/// End offset in source
pub end: usize,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kind {
Eof, // end of file
Plus,
}[ Token { kind: Kind::Plus, start: 0, end: 1 }, Token { kind: Kind::Eof, start: 1, end: 1 } ]
rust
use std::str::Chars;
struct Lexer<'a> {
/// Source Text
source: &'a str,
/// The remaining characters
chars: Chars<'a>
}
impl<'a> Lexer<'a> {
pub fn new(source: &'a str) -> Self {
Self {
source,
chars: source.chars()
}
}
}
``````rust
impl<'a> Lexer<'a> {
fn read_next_kind(&mut self) -> Kind {
while let Some(c) = self.chars.next() {
match c {
'+' => return Kind::Plus,
_ => {}
}
}
Kind::Eof
}
fn read_next_token(&mut self) -> Token {
let start = self.offset();
let kind = self.read_next_kind();
let end = self.offset();
Token { kind, start, end }
}
/// Get the length offset from the source text, in UTF-8 bytes
fn offset(&self) -> usize {
self.source.len() - self.chars.as_str().len()
}
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/iter.rs#L112-L115
pub fn as_str(&self) -> &'a str {
// SAFETY: `Chars` is only made from a str, which guarantees the iter is valid UTF-8.
unsafe { from_utf8_unchecked(self.iter.as_slice()) }
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/mod.rs#L157-L159
pub const fn len(&self) -> usize {
self.as_bytes().len()
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/str/mod.rs#L323-L325
pub const fn as_bytes(&self) -> &[u8] {
// SAFETY: const sound because we transmute two types with the same layout
unsafe { mem::transmute(self) }
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/slice/mod.rs#L129-L138
pub const fn len(&self) -> usize {
// FIXME: Replace with `crate::ptr::metadata(self)` when that is const-stable.
// As of this writing this causes a "Const-stable functions can only call other
// const-stable functions" error.
// SAFETY: Accessing the value from the `PtrRepr` union is safe since *const T
// and PtrComponents<T> have the same memory layouts. Only std can make this
// guarantee.
unsafe { crate::ptr::PtrRepr { const_ptr: self }.components.metadata }
}
``````rust
fn peek(&self) -> Option<char> {
self.chars.clone().next()
}
``````rust
// https://github.com/rust-lang/rust/blob/b998821e4c51c44a9ebee395c91323c374236bbb/library/core/src/slice/iter.rs#L148-L152
impl<T> Clone for Iter<'_, T> {
fn clone(&self) -> Self {
Iter { ptr: self.ptr, end: self.end, _marker: self._marker }
}
}
``````rust
// https://github.com/mozilla-spidermonkey/jsparagus/blob/master/crates/parser/src/lexer.rs#L1769-L1791
'+' => match self.peek() {
Some('+') => {
self.chars.next();
return self.set_result(
TerminalId::Increment,
SourceLocation::new(start, self.offset()),
TokenValue::None,
);
}
Some('=') => {
self.chars.next();
return self.set_result(
TerminalId::AddAssign,
SourceLocation::new(start, self.offset()),
TokenValue::None,
);
}
_ => return self.set_result(
TerminalId::Plus,
SourceLocation::new(start, self.offset()),
TokenValue::None,
),
},IdentifierStartChar :: UnicodeIDStart
IdentifierPartChar :: UnicodeIDContinue
UnicodeIDStart :: any Unicode code point with the Unicode property “ID_Start”
UnicodeIDContinue :: any Unicode code point with the Unicode property “ID_Continue”
rust
pub enum Kind {
Identifier,
If,
While,
For
}
``````rust
fn match_keyword(&self, ident: &str) -> Kind {
// all keywords are 1 < length <= 10
if ident.len() == 1 || ident.len() > 10 {
return Kind::Identifier;
}
match ident {
"if" => Kind::If,
"while" => Kind::While,
"for" => Kind::For,
_ => Kind::Identifier
}
}
``````rust{4-6}
pub enum Kind {
Eof, // end of file
Plus,
Identifier,
Number,
String,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
/// Token Type
pub kind: Kind,
/// Start offset in source
pub start: usize,
/// End offset in source
pub end: usize,
pub value: TokenValue,
}
#[derive(Debug, Clone, PartialEq)]
pub enum TokenValue {
None,
Number(f64),
String(String),
}Token { kind: Kind::Identifier, start: 0, end: 2, value: TokenValue::String("foo") } Token
rust
pub enum Kind {
Number(f64),
String(String),
}
``````rust
#[derive(Debug, Clone, PartialEq)]
pub enum TokenValue {
None,
Number(f64),
String(Atom),
}
```