blob: 99a6cb7b558650d963d97565624d85a1420b8e1f [file] [log] [blame]
//===-- RustLex.cpp ------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "RustLex.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/ADT/APInt.h"
using namespace lldb_private::rust;
using namespace lldb_private;
using namespace lldb;
using namespace llvm;
// Write a printable form of the token kind KIND to STREAM.
//
// Kinds that compare below STRING are written as the single character
// obtained by converting KIND to char.  Multi-character operators are
// written using their source spelling.  The remaining known kinds
// (literals, keywords, identifiers, INVALID and THATSALLFOLKS) are
// written as "[TOKEN=<kind>]", and anything unrecognized is written as
// "!!!OOPS!!!".
void lldb_private::rust::PrintTokenKind(Stream &stream, int kind) {
  if (kind < STRING) {
    stream << char(kind);
    return;
  }

  // Pick the spelling first, then emit it once at the bottom.
  const char *spelling = nullptr;
  switch (kind) {
  case STRING:
  case BYTESTRING:
  case CHAR:
  case BYTE:
  case FLOAT:
  case INTEGER:
  case AS:
  case TRUE:
  case FALSE:
  case SUPER:
  case SELF:
  case MUT:
  case CONST:
  case FN:
  case SIZEOF:
  case IDENTIFIER:
  case INVALID:
  case THATSALLFOLKS:
    // May want to clean this up someday.
    stream << "[TOKEN=" << kind << "]";
    return;

  case DOTDOT:
    spelling = "..";
    break;
  case DOTDOTEQ:
    spelling = "..=";
    break;
  case OROR:
    spelling = "||";
    break;
  case ANDAND:
    spelling = "&&";
    break;
  case EQEQ:
    spelling = "==";
    break;
  case NOTEQ:
    spelling = "!=";
    break;
  case LTEQ:
    spelling = "<=";
    break;
  case GTEQ:
    spelling = ">=";
    break;
  case LSH:
    spelling = "<<";
    break;
  case RSH:
    spelling = ">>";
    break;
  case PLUS_EQ:
    spelling = "+=";
    break;
  case MINUS_EQ:
    spelling = "-=";
    break;
  case SLASH_EQ:
    spelling = "/=";
    break;
  case STAR_EQ:
    spelling = "*=";
    break;
  case PERCENT_EQ:
    spelling = "%=";
    break;
  case RSH_EQ:
    spelling = ">>=";
    break;
  case LSH_EQ:
    spelling = "<<=";
    break;
  case AND_EQ:
    spelling = "&=";
    break;
  case OR_EQ:
    spelling = "|=";
    break;
  case XOR_EQ:
    spelling = "^=";
    break;
  case COLONCOLON:
    spelling = "::";
    break;
  case ARROW:
    spelling = "->";
    break;
  default:
    spelling = "!!!OOPS!!!";
    break;
  }

  stream << spelling;
}
llvm::StringMap<TokenKind> *Lexer::m_keywords;
// Lex and return the next token.
//
// Leading spaces and tabs are skipped.  At end of input a THATSALLFOLKS
// token is returned; otherwise the first character selects which
// specialized lexing routine handles the token.
Token Lexer::Next() {
  // Skip whitespace.
  // FIXME is it possible to see newlines here?
  for (; m_iter != m_end; ++m_iter) {
    if (*m_iter != ' ' && *m_iter != '\t') {
      break;
    }
  }

  if (m_iter == m_end) {
    return Token(THATSALLFOLKS);
  }

  const char first = *m_iter;
  if (first >= '0' && first <= '9') {
    return Number();
  }

  switch (first) {
  case 'b':
    // Could be b'..', b"..", br".." or just an identifier.
    return MaybeByteLiteral();
  case 'r':
    // Could be r"..", r#".."# or just an identifier.
    return MaybeRawString();
  case '"':
    return String();
  case '\'':
    return Character();
  default:
    return Operator();
  }
}
// Look STR up in the keyword/operator table.
//
// On a hit, store the associated token kind in *RESULT and return true.
// On a miss, return false and leave *RESULT untouched.
bool Lexer::Lookup(const ::llvm::StringRef &str, int *result) {
  ::llvm::StringMap<TokenKind> &table = *Keywords();
  auto pos = table.find(str);
  if (pos != table.end()) {
    *result = pos->second;
    return true;
  }
  return false;
}
// Lex an operator, punctuation character, keyword or identifier
// starting at m_iter (which must not be at m_end).
//
// Anything that begins like an identifier is handed to Identifier(),
// which performs its own whole-word keyword lookup.  Otherwise the
// longest multi-character operator is preferred (three characters, then
// two), followed by the single-character operators.  Returns an INVALID
// token, without consuming anything, if nothing matches.
Token Lexer::Operator() {
  const char first = *m_iter;

  // The keyword/operator table is shared, so a fixed-width prefix lookup
  // must never be applied to identifier text: otherwise "assert" would
  // lex as the keyword "as" followed by "sert", "mutex" as "mut" + "ex",
  // and so on.
  const bool starts_identifier = (first >= 'a' && first <= 'z') ||
                                 (first >= 'A' && first <= 'Z') ||
                                 first == '_';
  if (starts_identifier) {
    return Identifier();
  }

  int kind;

  if (Remaining() >= 3 && Lookup(::llvm::StringRef(m_iter, 3), &kind)) {
    m_iter += 3;
    return Token(kind);
  }

  if (Remaining() >= 2 && Lookup(::llvm::StringRef(m_iter, 2), &kind)) {
    m_iter += 2;
    return Token(kind);
  }

  // strchr() also matches the terminating NUL of the set, so an embedded
  // '\0' in the input has to be excluded explicitly.  '^' is included so
  // that plain xor lexes just like its compound form "^=".
  if (first != '\0' &&
      strchr(".,;|&=!<>+-*/%^:[](){}", first) != nullptr) {
    ++m_iter;
    return Token(first);
  }

  return Token(INVALID);
}
// Lex a run of digits starting at m_iter, appending them to *VALUE.
//
// m_iter must point at a decimal digit on entry.  When RADIX_OUT is
// non-null, a leading "0x", "0b" or "0o" prefix is recognized and
// selects radix 16, 2 or 8; the prefix itself is not appended to
// *VALUE.  When RADIX_OUT is null no prefix is recognized and the digits
// are taken as decimal.  Underscores between digits are skipped.
//
// The radix used is stored in *RADIX_OUT when it is non-null.  Returns
// false only if a radix prefix was seen but no digit followed it.
bool Lexer::BasicInteger(int *radix_out, std::string *value) {
  assert (m_iter != m_end);
  assert (*m_iter >= '0' && *m_iter <= '9');

  int base = 10;
  bool digit_pending = false;

  if (radix_out != nullptr && *m_iter == '0') {
    // Step over the zero and peek at what follows; end of input is
    // treated like any other non-prefix character.
    ++m_iter;
    switch (m_iter == m_end ? '\0' : *m_iter) {
    case 'x':
      base = 16;
      break;
    case 'b':
      base = 2;
      break;
    case 'o':
      base = 8;
      break;
    default:
      break;
    }

    if (base != 10) {
      // Consume the prefix letter; at least one digit must follow.
      digit_pending = true;
      ++m_iter;
    } else {
      // The zero was an ordinary decimal digit after all.
      value->push_back('0');
    }
  }

  // Base is final from here on, so it can be captured by value.
  auto is_digit_in_base = [base](char ch) {
    switch (base) {
    case 2:
      return ch >= '0' && ch <= '1';
    case 8:
      return ch >= '0' && ch <= '7';
    case 16:
      return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
             (ch >= 'A' && ch <= 'F');
    default:
      return ch >= '0' && ch <= '9';
    }
  };

  for (; m_iter != m_end; ++m_iter) {
    const char ch = *m_iter;
    if (ch == '_') {
      // Digit separator; not part of the value.
      continue;
    }
    if (!is_digit_in_base(ch)) {
      break;
    }
    value->push_back(ch);
    digit_pending = false;
  }

  if (radix_out != nullptr) {
    *radix_out = base;
  }
  return !digit_pending;
}
// Try to match one of the literal suffixes at m_iter.
//
// SUFFIXES is a null-terminated array of C strings, tried in order.  The
// first one that matches the upcoming input is consumed and returned
// (the pointer comes from SUFFIXES itself).  Returns null, consuming
// nothing, if none of them match.
const char *Lexer::CheckSuffix(const char *const *suffixes) {
  const size_t available = Remaining();

  for (const char *const *candidate = suffixes; *candidate != nullptr;
       ++candidate) {
    const size_t length = strlen(*candidate);
    if (length > available) {
      // Not enough input left for this one.
      continue;
    }
    if (::llvm::StringRef(m_iter, length) == *candidate) {
      m_iter += length;
      return *candidate;
    }
  }

  return nullptr;
}
// Lex the fractional and/or exponent part of a number whose integer
// part has already been appended to *VALUE.
//
// m_iter must point at '.', 'e' or 'E'.  Returns:
//   INTEGER - a '.' was seen but no digit follows it, so this is not a
//             floating-point number; nothing is consumed.
//   FLOAT   - the text of the float was appended to *VALUE.
//   INVALID - an exponent marker was not followed by a (possibly
//             signed) digit.
int Lexer::Float(std::string *value) {
  assert(m_iter != m_end && (*m_iter == '.' || *m_iter == 'e' || *m_iter == 'E'));

  auto is_digit = [](char ch) { return ch >= '0' && ch <= '9'; };

  if (*m_iter == '.') {
    // Look past the dot before committing to it.
    const auto after_dot = m_iter + 1;
    if (after_dot == m_end || !is_digit(*after_dot)) {
      // Not a floating-point number.
      return INTEGER;
    }
    m_iter = after_dot;
    value->push_back('.');
    BasicInteger(nullptr, value);
  }

  const bool has_exponent =
      m_iter != m_end && (*m_iter == 'e' || *m_iter == 'E');
  if (!has_exponent) {
    return FLOAT;
  }

  // Exponent marker.
  value->push_back(*m_iter);
  ++m_iter;

  // Optional sign.
  if (m_iter != m_end && (*m_iter == '+' || *m_iter == '-')) {
    value->push_back(*m_iter);
    ++m_iter;
  }

  // The exponent needs at least one digit.
  if (m_iter == m_end || !is_digit(*m_iter)) {
    return INVALID;
  }

  BasicInteger(nullptr, value);
  return FLOAT;
}
// Lex an integer or floating-point literal starting at m_iter.
//
// Returns a FLOAT token (value plus optional "f32"/"f64" suffix), an
// INTEGER token (value plus optional integer type suffix), or an
// INVALID token if the literal is malformed or an integer does not fit
// in 64 bits.
Token Lexer::Number() {
  static const char * const float_suffixes[] = {
    "f32",
    "f64",
    nullptr
  };
  static const char * const int_suffixes[] = {
    "u8",
    "i8",
    "u16",
    "i16",
    "u32",
    "i32",
    "u64",
    "i64",
    "usize",
    "isize",
    nullptr
  };

  std::string number;
  int radix;

  if (!BasicInteger(&radix, &number)) {
    return Token(INVALID);
  }

  // Only decimal literals can continue as floating point.
  if (m_iter != m_end && radix == 10 &&
      (*m_iter == '.' || *m_iter == 'e' || *m_iter == 'E')) {
    int kind = Float(&number);
    if (kind == INVALID) {
      return Token(INVALID);
    }
    if (kind == FLOAT) {
      // Actually a float.
      ::llvm::StringRef sref(number);
      double dval;
      if (sref.getAsDouble(dval)) {
        return Token(INVALID);
      }

      const char *suffix = CheckSuffix(float_suffixes);
      return Token(FLOAT, dval, suffix);
    }
    // Floating-point lex failed but we still have an integer.
    assert(kind == INTEGER);
  }

  APInt value;
  ::llvm::StringRef sref(number);
  if (sref.getAsInteger(radix, value)) {
    return Token(INVALID);
  }
  // FIXME maybe we should just leave it as an APInt through the whole
  // process.
  //
  // Judge overflow by the value's significant bits rather than by the
  // APInt's storage width, which may be wider than the value needs; a
  // width-based test can reject literals that fit in 64 bits.
  if (value.getActiveBits() > 64) {
    return Token(INVALID);
  }

  const char *suffix = CheckSuffix(int_suffixes);
  return Token(INTEGER, value.getLimitedValue(), suffix);
}
// Lex an identifier or keyword starting at m_iter.
//
// An identifier is an ASCII letter or underscore followed by any number
// of ASCII letters, digits and underscores.  If the text is found in
// the keyword table, a token of that kind is returned; otherwise an
// IDENTIFIER token carrying the text.  Returns an INVALID token, without
// consuming anything, if m_iter does not point at an identifier start.
Token Lexer::Identifier() {
  assert(m_iter != m_end);

  auto is_start = [](char ch) {
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
  };
  auto is_digit = [](char ch) { return ch >= '0' && ch <= '9'; };

  const ::llvm::StringRef::iterator begin = m_iter;
  if (!is_start(*m_iter)) {
    return Token(INVALID);
  }

  // Consume the first character, then everything that may continue it.
  do {
    ++m_iter;
  } while (m_iter != m_end && (is_start(*m_iter) || is_digit(*m_iter)));

  ::llvm::StringRef name(begin, m_iter - begin);

  int keyword;
  if (Lookup(name, &keyword)) {
    return Token(keyword);
  }
  return Token(IDENTIFIER, name.str());
}
// Lex a token that starts with 'b'.
//
// The character after the 'b' decides what this is: 'r' may begin a raw
// byte string, '"' begins a byte string and '\'' begins a byte
// character.  Anything else (including a lone trailing 'b') is lexed as
// an identifier.
Token Lexer::MaybeByteLiteral() {
  assert(*m_iter == 'b');

  if (Remaining() >= 2) {
    switch (m_iter[1]) {
    case 'r':
      // MaybeRawString steps over the 'b' itself.
      return MaybeRawString(true);
    case '"':
      ++m_iter;
      return String(true);
    case '\'':
      ++m_iter;
      return Character(true);
    default:
      break;
    }
  }

  return Identifier();
}
// Parse up to MAX_DIGITS hexadecimal digits at m_iter.
//
// Parsing stops at the first non-hex character, at end of input, or
// once MAX_DIGITS digits have been consumed.  The accumulated value is
// stored in *RESULT (zero if no digit was seen).  Returns true if at
// least MIN_DIGITS digits were consumed.
bool Lexer::ParseHex(uint64_t *result, int min_digits, int max_digits) {
  uint64_t accumulated = 0;
  int consumed = 0;

  while (consumed < max_digits && m_iter != m_end) {
    const char ch = *m_iter;
    uint64_t nibble;
    if (ch >= '0' && ch <= '9') {
      nibble = ch - '0';
    } else if (ch >= 'a' && ch <= 'f') {
      nibble = ch - 'a' + 10;
    } else if (ch >= 'A' && ch <= 'F') {
      nibble = ch - 'A' + 10;
    } else {
      break;
    }

    accumulated = (accumulated << 4) | nibble;
    ++m_iter;
    ++consumed;
  }

  *result = accumulated;
  return consumed >= min_digits;
}
// Parse a backslash escape sequence; m_iter must point at the backslash.
//
// Recognized forms are \xHH (exactly two hex digits), \u{H..} (one to
// six hex digits; rejected when IS_BYTE is true), and the
// single-character escapes \n \r \t \\ \0 \' \".  On success the
// escape's value is stored in *RESULT and true is returned; otherwise
// false is returned.
bool Lexer::ParseEscape(uint64_t *result, bool is_byte) {
  assert(*m_iter == '\\');
  ++m_iter;

  if (m_iter == m_end) {
    return false;
  }

  const char code = *m_iter;
  ++m_iter;

  if (code == 'x') {
    return ParseHex(result, 2, 2);
  }

  if (code == 'u') {
    if (is_byte) {
      return false;
    }

    // Opening brace.
    if (m_iter == m_end) {
      return false;
    }
    const char open = *m_iter;
    ++m_iter;
    if (open != '{') {
      return false;
    }

    if (!ParseHex(result, 1, 6)) {
      return false;
    }

    // Closing brace.
    if (m_iter == m_end) {
      return false;
    }
    const char close = *m_iter;
    ++m_iter;
    return close == '}';
  }

  // Single-character escapes.
  uint64_t simple;
  switch (code) {
  case 'n':
    simple = '\n';
    break;
  case 'r':
    simple = '\r';
    break;
  case 't':
    simple = '\t';
    break;
  case '\\':
    simple = '\\';
    break;
  case '0':
    simple = 0;
    break;
  case '\'':
    simple = '\'';
    break;
  case '"':
    simple = '"';
    break;
  default:
    return false;
  }

  *result = simple;
  return true;
}
// Parse the escape sequence at m_iter (which must point at the
// backslash) and append what it denotes to *RESULT.
//
// For byte strings (IS_BYTE true) an escape denotes exactly one byte,
// which is appended as-is.  Otherwise the escape denotes a code point,
// which is appended in its UTF-8 encoding.  Returns false if the escape
// is malformed or its value cannot be represented.
bool Lexer::AppendEscape(std::string *result, bool is_byte) {
  uint64_t value;
  if (!ParseEscape(&value, is_byte)) {
    return false;
  }

  if (is_byte) {
    // A byte string holds raw bytes: "\xff" must contribute the single
    // byte 0xff, not the two-byte UTF-8 encoding of U+00FF.
    if (value > 0xff) {
      return false;
    }
    result->push_back(static_cast<char>(value));
    return true;
  }

  char utf8[10];
  char *out = utf8;
  if (!ConvertCodePointToUTF8(value, out)) {
    return false;
  }

  // ConvertCodePointToUTF8 advanced OUT past the bytes it wrote.
  result->append(utf8, out);
  return true;
}
// Lex a character literal; m_iter must point at the opening quote.
//
// The literal is either a single escape sequence or a single input
// character, followed by a closing quote.  Returns a BYTE token when
// IS_BYTE is true and a CHAR token otherwise, carrying the character's
// value; returns an INVALID token if the literal is malformed or
// unterminated.
//
// NOTE(review): an unescaped character is taken to be exactly one input
// byte, so a multi-byte UTF-8 sequence between the quotes is reported as
// INVALID rather than decoded.
Token Lexer::Character(bool is_byte) {
  assert(*m_iter == '\'');

  if (++m_iter == m_end) {
    return Token(INVALID);
  }

  uint64_t result;
  if (*m_iter == '\\') {
    if (!ParseEscape(&result, is_byte)) {
      return Token(INVALID);
    }
  } else {
    // Go through unsigned char so that a byte >= 0x80 is not
    // sign-extended into a huge 64-bit value where plain char is signed.
    result = static_cast<unsigned char>(*m_iter++);
  }

  if (m_iter == m_end || *m_iter++ != '\'') {
    return Token(INVALID);
  }

  return Token(is_byte ? BYTE : CHAR, result);
}
// Lex a token that may be a raw string: r"..", r#".."#, r##".."##, ...
// (or the same with a leading 'b' when IS_BYTE is true).
//
// m_iter must point at the 'r', or at the 'b' preceding it when IS_BYTE
// is true.  If the prefix is not followed by optional '#'s and an
// opening quote, this is not a raw string and the text is lexed as an
// identifier instead.  Otherwise the string runs up to the first quote
// that is followed by as many '#'s as opened it, and a STRING (or
// BYTESTRING) token holding the text between the quotes is returned.
// An unterminated raw string consumes the rest of the input and yields
// an INVALID token.
Token Lexer::MaybeRawString(bool is_byte) {
  // Use a local copy so we can backtrack if need be.
  ::llvm::StringRef::iterator iter = m_iter;

  if (is_byte) {
    assert(*iter == 'b');
    ++iter;
  }

  assert(*iter == 'r');
  ++iter;

  const ::llvm::StringRef::iterator before_hashes = iter;
  while (iter != m_end && *iter == '#') {
    ++iter;
  }
  if (iter == m_end || *iter != '"') {
    // Not a raw string after all; m_iter was never moved.
    return Identifier();
  }

  const size_t n_hashes = iter - before_hashes;
  const ::llvm::StringRef::iterator string_start = ++iter;

  for (; iter != m_end; ++iter) {
    if (*iter != '"') {
      continue;
    }

    // Number of characters available after this quote.  The closing
    // hashes must fit entirely inside the input; checking this first
    // keeps the comparison below from reading past m_end.
    const size_t after_quote = size_t(m_end - iter) - 1;
    if (after_quote < n_hashes) {
      continue;
    }

    if (n_hashes == 0 ||
        strncmp(iter + 1, before_hashes, n_hashes) == 0) {
      break;
    }
  }

  m_iter = iter;
  if (iter == m_end) {
    return Token(INVALID);
  }

  // Step over the closing quote and its hashes.
  assert(*m_iter == '"');
  ++m_iter;
  assert(Remaining() >= n_hashes);
  m_iter += n_hashes;

  return Token(is_byte ? BYTESTRING : STRING, std::string(string_start, iter));
}
// Lex a (non-raw) string literal; m_iter must point at the opening
// quote.
//
// Escape sequences are translated via AppendEscape and every other
// character is copied verbatim.  Returns a BYTESTRING token when IS_BYTE
// is true and a STRING token otherwise, or an INVALID token if an
// escape is bad or the closing quote is missing.
Token Lexer::String(bool is_byte) {
  assert(*m_iter == '"');
  ++m_iter;

  std::string contents;
  for (;;) {
    if (m_iter == m_end) {
      // Ran off the end without seeing the closing quote.
      return Token(INVALID);
    }

    const char ch = *m_iter;
    if (ch == '"') {
      break;
    }

    if (ch == '\\') {
      // AppendEscape consumes the whole escape sequence itself.
      if (!AppendEscape(&contents, is_byte)) {
        return Token(INVALID);
      }
      continue;
    }

    contents.push_back(ch);
    ++m_iter;
  }

  // Step over the closing quote.
  assert(*m_iter == '"');
  ++m_iter;

  return Token(is_byte ? BYTESTRING : STRING, std::move(contents));
}
// Return the table mapping keyword and multi-character operator
// spellings to token kinds, building it on the first call.
//
// The table is allocated once, stored in m_keywords and never freed.
::llvm::StringMap<TokenKind> *Lexer::Keywords() {
  if (m_keywords != nullptr) {
    return m_keywords;
  }

  const struct {
    const char *spelling;
    TokenKind kind;
  } entries[] = {
    // Keywords.
    {"as", AS},
    {"true", TRUE},
    {"false", FALSE},
    {"super", SUPER},
    {"self", SELF},
    {"mut", MUT},
    {"const", CONST},
    {"fn", FN},
    {"sizeof", SIZEOF},
    // Multi-character operators.
    {"..", DOTDOT},
    {"..=", DOTDOTEQ},
    {"||", OROR},
    {"|=", OR_EQ},
    {"&&", ANDAND},
    {"&=", AND_EQ},
    {"^=", XOR_EQ},
    {"==", EQEQ},
    {"!=", NOTEQ},
    {"<=", LTEQ},
    {">=", GTEQ},
    {"<<", LSH},
    {">>", RSH},
    {"+=", PLUS_EQ},
    {"-=", MINUS_EQ},
    {"*=", STAR_EQ},
    {"/=", SLASH_EQ},
    {"%=", PERCENT_EQ},
    {"<<=", LSH_EQ},
    {">>=", RSH_EQ},
    {"::", COLONCOLON},
    {"->", ARROW},
  };

  m_keywords = new ::llvm::StringMap<TokenKind>;
  for (const auto &entry : entries) {
    (*m_keywords)[entry.spelling] = entry.kind;
  }

  return m_keywords;
}