参照《Writing An Interpreter In Go》,改用C++实现。
项目源码: https://github.com/LeslieZhu/monkey-cpp
引言
本篇对应的源码位于目录: src/01/
src
|01
| |token
| | |token.hpp
| |CMakeLists.txt
| |test
| | |main.cpp
| | |lexer_test.cpp
| |lexer
| | |lexer.hpp
| |repl
| | |repl.hpp
将源代码转换为词法单元,这个过程叫词法分析
,完成这个过程的叫词法分析器(tokenizer/scanner)
。
定义词法单元Token
namespace token
{
using TokenType = std::string;
struct Token
{
TokenType Type;
std::string Literal;
};
}
用一个结构体来存储Token,它包含类型和源码内容。
用常量来标记所有的Token类型:
namespace token
{
namespace types
{
const TokenType ILLEGAL = "ILLEGAL";
const TokenType EndOF = "EOF";
// Identifiers + literals
const TokenType IDENT = "IDENT"; // add, foobar, x, y, ...
const TokenType INT = "INT"; // 1343456
// Operators
const TokenType ASSIGN = "=";
const TokenType PLUS = "+";
const TokenType MINUS = "-";
const TokenType BANG = "!";
const TokenType ASTERISK = "*";
const TokenType SLASH = "/";
const TokenType LT = "<";
const TokenType GT = ">";
const TokenType EQ = "==";
const TokenType NOT_EQ = "!=";
// Delimiters
const TokenType COMMA = ",";
const TokenType SEMICOLON = ";";
const TokenType LPAREN = "(";
const TokenType RPAREN = ")";
const TokenType LBRACE = "{";
const TokenType RBRACE = "}";
// Keywords
const TokenType FUNCTION = "FUNCTION";
const TokenType LET = "LET";
const TokenType TRUE = "TRUE";
const TokenType FALSE = "FALSE";
const TokenType IF = "IF";
const TokenType ELSE = "ELSE";
const TokenType RETURN = "RETURN";
}
}
对于暂不支持的词法类型统一设置为ILLEGAL
,源码文件结束则使用EndOF
, 而对于关键字则用一个字典来管理:
std::map<std::string, TokenType> keywords = {{"fn", token::types::FUNCTION},
{"let", token::types::LET},
{"true", token::types::TRUE},
{"false", token::types::FALSE},
{"if", token::types::IF},
{"else", token::types::ELSE},
{"return", token::types::RETURN}};
词法分析器Lexer
namespace lexer
{
struct Lexer
{
std::string input;
int position; // current position in input (points to current char)
int readPosition; // current reading position in input (after current char)
char ch; // current char under examination
Lexer(const std::string input_) : input(input_),
position(0),
readPosition(0),
ch(' ')
{
}
};
}
用结构体Lexer来存储读取的字符、当期位置、下一个读取位置等信息。
同时提供一个创建词法分析器对象的函数:
namespace lexer
{
std::unique_ptr<Lexer> New(std::string input)
{
std::unique_ptr<Lexer> l = std::make_unique<Lexer>(input);
l->readChar();
return l;
}
}
一旦Lexer创建后,它不断读取源代码并识别成词法单元Token:
token::Token NextToken()
{
token::Token tok;
skipWhitespace();
switch (ch)
{
case '=':
{
if (peekChar() == '=')
{
char oldChar = ch;
readChar();
std::string literal = std::string(1, oldChar) + std::string(1, ch);
tok.Type = token::types::EQ;
tok.Literal = literal;
}
else
{
tok = newToken(token::types::ASSIGN, ch);
}
}
break;
case '+':
{
tok = newToken(token::types::PLUS, ch);
}
break;
case '-':
{
tok = newToken(token::types::MINUS, ch);
}
break;
case '!':
{
if (peekChar() == '=')
{
char oldChar = ch;
readChar();
std::string literal = std::string(1, oldChar) + std::string(1, ch);
tok.Type = token::types::NOT_EQ;
tok.Literal = literal;
}
else
{
tok = newToken(token::types::BANG, ch);
}
}
break;
case '/':
tok = newToken(token::types::SLASH, ch);
break;
case '*':
tok = newToken(token::types::ASTERISK, ch);
break;
case '<':
tok = newToken(token::types::LT, ch);
break;
case '>':
tok = newToken(token::types::GT, ch);
break;
case ';':
tok = newToken(token::types::SEMICOLON, ch);
break;
case ',':
tok = newToken(token::types::COMMA, ch);
break;
case '{':
tok = newToken(token::types::LBRACE, ch);
break;
case '}':
tok = newToken(token::types::RBRACE, ch);
break;
case '(':
tok = newToken(token::types::LPAREN, ch);
break;
case ')':
tok = newToken(token::types::RPAREN, ch);
break;
case 0:
{
tok.Literal = "";
tok.Type = token::types::EndOF;
}
break;
default:
{
if (isLetter(ch))
{
tok.Literal = readIdentifier();
tok.Type = token::LookupIdent(tok.Literal);
return tok;
}
else if (isDigit(ch))
{
tok.Literal = readNumber();
tok.Type = token::types::INT;
return tok;
}
else
{
tok = newToken(token::types::ILLEGAL, ch);
}
}
}
readChar();
return tok;
}
测试用例
TEST(TestNextToken, BasicAssertions)
{
std::string input = R""(
leT five = 5;
)"";
std::vector<token::Token> tests{{token::types::LET, "let"},
{token::types::IDENT, "five"},
{token::types::ASSIGN, "="},
{token::types::INT, "5"},
{token::types::SEMICOLON, ";"},
{token::types::EndOF, ""}};
auto lexer = lexer::New(input);
for (const auto &test : tests)
{
token::Token tok = lexer->NextToken();
EXPECT_EQ(tok.Type, test.Type);
EXPECT_EQ(tok.Literal, test.Literal);
}
}
测试不通过:
$ ./monkey_test
~/waiicpp/src/01/test/lexer_test.cpp:111: Failure
Expected equality of these values:
tok.Type
Which is: "IDENT"
test.Type
Which is: "LET"
~/waiicpp/src/01/test/lexer_test.cpp:112: Failure
Expected equality of these values:
tok.Literal
Which is: "leT"
test.Literal
Which is: "let"
[ FAILED ] TestNextToken.BasicAssertions (0 ms)
[----------] 1 test from TestNextToken (0 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test suite ran. (0 ms total)
[ PASSED ] 0 tests.
[ FAILED ] 1 test, listed below:
[ FAILED ] TestNextToken.BasicAssertions
1 FAILED TEST
将leT
改成let
修正后,测试通过:
$ ./monkey_test
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from TestNextToken
[ RUN ] TestNextToken.BasicAssertions
[ OK ] TestNextToken.BasicAssertions (0 ms)
[----------] 1 test from TestNextToken (0 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test suite ran. (0 ms total)
[ PASSED ] 1 test.
REPL
这个REPL接收一个输入然后打印Token解析结果:
namespace repl
{
const static std::string PROMPT = ">> ";
void Start()
{
std::string line;
while (true)
{
std::cout << PROMPT;
getline(std::cin, line);
if (line.size() == 0)
{
break;
}
std::unique_ptr<lexer::Lexer> l = lexer::New(line);
for (token::Token tok = l->NextToken(); tok.Type != token::types::EndOF; tok = l->NextToken())
{
std::cout << tok << std::endl;
}
}
}
}
main主程序
int main()
{
char *user = getlogin();
std::cout << "Hello " << user << "! This is the Monkey-CPP programming language!" << std::endl;
std::cout << "Feel free to type in commands" << std::endl;
repl::Start();
return 0;
}
总结
最终运行情况:
Hello lesliezhu! This is the Monkey-CPP programming language!
Feel free to type in commands
>> let add = fn(x, y) { x + y; };
{Type:LET Literal:let}
{Type:IDENT Literal:add}
{Type:= Literal:=}
{Type:FUNCTION Literal:fn}
{Type:( Literal:(}
{Type:IDENT Literal:x}
{Type:, Literal:,}
{Type:IDENT Literal:y}
{Type:) Literal:)}
{Type:{ Literal:{}
{Type:IDENT Literal:x}
{Type:+ Literal:+}
{Type:IDENT Literal:y}
{Type:; Literal:;}
{Type:} Literal:}}
{Type:; Literal:;}
>>
发表评论