about summary refs log tree commit diff
path: root/users/fcuny/exp/monkey/pkg/lexer/lexer.go
diff options
context:
space:
mode:
author: franck cuny <franck@fcuny.net> 2020-01-11 13:34:54 +0100
committer: franck cuny <franck@fcuny.net> 2020-01-11 13:55:16 +0100
commitd7b15313dc554871cc0973924323f7407722121a (patch)
treee78190942639de4673c5f64ed51aed95eb839cb2 /users/fcuny/exp/monkey/pkg/lexer/lexer.go
parenttoken: initial tokenizer. (diff)
downloadworld-d7b15313dc554871cc0973924323f7407722121a.tar.gz
lexer: initial lexer
The initial lexer for the monkey language. We only support a small
subset at this stage.

We have some simple tests to ensure that we can parse some small
snippet, and that the minimum number of tokens we need are also all
supported correctly.
Diffstat (limited to 'users/fcuny/exp/monkey/pkg/lexer/lexer.go')
-rw-r--r-- users/fcuny/exp/monkey/pkg/lexer/lexer.go (114 lines)
1 file changed, 114 insertions, 0 deletions
diff --git a/users/fcuny/exp/monkey/pkg/lexer/lexer.go b/users/fcuny/exp/monkey/pkg/lexer/lexer.go
new file mode 100644
index 0000000..fc29371
--- /dev/null
+++ b/users/fcuny/exp/monkey/pkg/lexer/lexer.go
@@ -0,0 +1,114 @@
+package lexer
+
+import "monkey/pkg/token"
+
// Lexer performs lexical analysis over an input string, producing
// token.Token values one at a time via NextToken. The input is scanned
// byte by byte, so only single-byte (ASCII) characters are handled
// correctly; multi-byte UTF-8 runes are not supported at this stage.
type Lexer struct {
	input string
	// position is the index in input of the byte currently held in ch.
	position int
	// readPosition is the index of the next byte to read; it is always
	// position + 1 once readChar has run at least once.
	readPosition int
	// ch is the byte under examination; 0 (ASCII NUL) once the input
	// has been fully consumed.
	ch byte
}
+
+// New returns a new lexer
+func New(input string) *Lexer {
+	l := &Lexer{input: input}
+	l.readChar()
+	return l
+}
+
+// Read the current character and advances our position in the input string.
+func (l *Lexer) readChar() {
+	// if we've reached the end of the input, we set the current character to 0,
+	// which is the ASCII code for NUL.
+	if l.readPosition >= len(l.input) {
+		l.ch = 0
+	} else {
+		l.ch = l.input[l.readPosition]
+	}
+	l.position = l.readPosition
+	l.readPosition++
+}
+
+func (l *Lexer) readIdentifier() string {
+	position := l.position
+	for isLetter(l.ch) {
+		l.readChar()
+	}
+	return l.input[position:l.position]
+}
+
+func (l *Lexer) readNumber() string {
+	position := l.position
+	for isDigit(l.ch) {
+		l.readChar()
+	}
+	return l.input[position:l.position]
+}
+
+// we don't care about white space characters, we skip them when we find them.
+func (l *Lexer) skipWhitespace() {
+	for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
+		l.readChar()
+	}
+}
+
+// NextToken reads the next token from the lexer and returns the current token.
+func (l *Lexer) NextToken() token.Token {
+	var tok token.Token
+
+	l.skipWhitespace()
+
+	switch l.ch {
+	case '=':
+		tok = newToken(token.ASSIGN, l.ch)
+	case '+':
+		tok = newToken(token.PLUS, l.ch)
+	case ';':
+		tok = newToken(token.SEMICOLON, l.ch)
+	case ',':
+		tok = newToken(token.COMMA, l.ch)
+	case '(':
+		tok = newToken(token.LPAREN, l.ch)
+	case ')':
+		tok = newToken(token.RPAREN, l.ch)
+	case '{':
+		tok = newToken(token.LBRACE, l.ch)
+	case '}':
+		tok = newToken(token.RBRACE, l.ch)
+	case 0:
+		tok.Literal = ""
+		tok.Type = token.EOF
+	default:
+		if isLetter(l.ch) {
+			tok.Literal = l.readIdentifier()
+			tok.Type = token.LookupIdent(tok.Literal)
+			return tok
+		} else if isDigit(l.ch) {
+			tok.Type = token.INT
+			tok.Literal = l.readNumber()
+			return tok
+		} else {
+			tok = newToken(token.ILLEGAL, l.ch)
+		}
+
+	}
+
+	l.readChar()
+	return tok
+}
+
+func newToken(tokenType token.TokenType, ch byte) token.Token {
+	return token.Token{Type: tokenType, Literal: string(ch)}
+}
+
// isLetter reports whether ch may appear in an identifier: an ASCII
// letter (either case) or the underscore.
func isLetter(ch byte) bool {
	switch {
	case ch >= 'a' && ch <= 'z':
		return true
	case ch >= 'A' && ch <= 'Z':
		return true
	case ch == '_':
		return true
	}
	return false
}
+
// isDigit reports whether ch is an ASCII decimal digit ('0' through '9').
func isDigit(ch byte) bool {
	return ch >= '0' && ch <= '9'
}