about summary refs log tree commit diff
path: root/users/fcuny/exp/monkey/pkg/lexer/lexer.go
diff options
context:
space:
mode:
author: franck cuny <franck@fcuny.net> 2020-01-11 13:34:54 +0100
committer: franck cuny <franck@fcuny.net> 2020-01-11 13:55:16 +0100
commitd7b15313dc554871cc0973924323f7407722121a (patch)
treee78190942639de4673c5f64ed51aed95eb839cb2 /users/fcuny/exp/monkey/pkg/lexer/lexer.go
parenttoken: initial tokenizer. (diff)
downloadworld-d7b15313dc554871cc0973924323f7407722121a.tar.gz
lexer: initial lexer
The initial lexer for the monkey language. We only support a small
subset at this stage.

We have some simple tests to ensure that we can parse some small
snippet, and that the minimum number of tokens we need are also all
supported correctly.
Diffstat (limited to 'users/fcuny/exp/monkey/pkg/lexer/lexer.go')
-rw-r--r-- users/fcuny/exp/monkey/pkg/lexer/lexer.go (114 lines)
1 file changed, 114 insertions, 0 deletions
diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go
new file mode 100644
index 0000000..fc29371
--- /dev/null
+++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go
@@ -0,0 +1,114 @@
+package lexer
+
+import "monkey/pkg/token"
+
// Lexer performs lexical analysis over an input string, producing
// token.Token values one at a time via NextToken. The input is scanned
// byte by byte, so only single-byte (ASCII) characters are handled
// correctly; multi-byte UTF-8 runes are not supported at this stage.
type Lexer struct {
	input string
	// position is the index in input of the byte currently held in ch.
	position int
	// readPosition is the index of the next byte to read; it is always
	// position + 1 once readChar has run at least once.
	readPosition int
	// ch is the byte under examination; 0 (ASCII NUL) once the input
	// has been fully consumed.
	ch byte
}
+
+// New returns a new lexer
+func New(input string) *Lexer {
+	l := &Lexer{input: input}
+	l.readChar()
+	return l
+}
+
+// Read the current character and advances our position in the input string.
+func (l *Lexer) readChar() {
+	// if we've reached the end of the input, we set the current character to 0,
+	// which is the ASCII code for NUL.
+	if l.readPosition >= len(l.input) {
+		l.ch = 0
+	} else {
+		l.ch = l.input[l.readPosition]
+	}
+	l.position = l.readPosition
+	l.readPosition++
+}
+
+func (l *Lexer) readIdentifier() string {
+	position := l.position
+	for isLetter(l.ch) {
+		l.readChar()
+	}
+	return l.input[position:l.position]
+}
+
+func (l *Lexer) readNumber() string {
+	position := l.position
+	for isDigit(l.ch) {
+		l.readChar()
+	}
+	return l.input[position:l.position]
+}
+
+// we don't care about white space characters, we skip them when we find them.
+func (l *Lexer) skipWhitespace() {
+	for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
+		l.readChar()
+	}
+}
+
+// NextToken reads the next token from the lexer and returns the current token.
+func (l *Lexer) NextToken() token.Token {
+	var tok token.Token
+
+	l.skipWhitespace()
+
+	switch l.ch {
+	case '=':
+		tok = newToken(token.ASSIGN, l.ch)
+	case '+':
+		tok = newToken(token.PLUS, l.ch)
+	case ';':
+		tok = newToken(token.SEMICOLON, l.ch)
+	case ',':
+		tok = newToken(token.COMMA, l.ch)
+	case '(':
+		tok = newToken(token.LPAREN, l.ch)
+	case ')':
+		tok = newToken(token.RPAREN, l.ch)
+	case '{':
+		tok = newToken(token.LBRACE, l.ch)
+	case '}':
+		tok = newToken(token.RBRACE, l.ch)
+	case 0:
+		tok.Literal = ""
+		tok.Type = token.EOF
+	default:
+		if isLetter(l.ch) {
+			tok.Literal = l.readIdentifier()
+			tok.Type = token.LookupIdent(tok.Literal)
+			return tok
+		} else if isDigit(l.ch) {
+			tok.Type = token.INT
+			tok.Literal = l.readNumber()
+			return tok
+		} else {
+			tok = newToken(token.ILLEGAL, l.ch)
+		}
+
+	}
+
+	l.readChar()
+	return tok
+}
+
+func newToken(tokenType token.TokenType, ch byte) token.Token {
+	return token.Token{Type: tokenType, Literal: string(ch)}
+}
+
// isLetter reports whether ch may appear in an identifier: an ASCII
// letter (either case) or the underscore.
func isLetter(ch byte) bool {
	switch {
	case ch >= 'a' && ch <= 'z':
		return true
	case ch >= 'A' && ch <= 'Z':
		return true
	case ch == '_':
		return true
	}
	return false
}
+
// isDigit reports whether ch is an ASCII decimal digit ('0' through '9').
func isDigit(ch byte) bool {
	return ch >= '0' && ch <= '9'
}