/**
Copyright: Copyright (c) 2017-2019 Andrey Penechko.
License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors: Andrey Penechko.
*/
/// Lexer
module vox.fe.passes.lexer;

import std.format : formattedWrite;
import std.string : format;
import std.range : repeat;
import std.stdio;

import vox.all;


/// Runs the lexer over every file registered in `ctx.files`.
/// For each file: records the index of its first token, lexes from `file.start`
/// until the EOI token, adds the lexer's line counter to `ctx.numLinesLexed` and,
/// when `ctx.printLexemes` is set, prints every produced token.
/// `subPasses` is not used by this pass.
void pass_lexer(ref CompilationContext ctx, CompilePassPerModule[] subPasses)
{
	foreach (ref SourceFileInfo file; ctx.files.data)
	{
		file.firstTokenIndex = TokenIndex(ctx.tokenLocationBuffer.uintLength);

		Lexer lexer = Lexer(&ctx, ctx.sourceBuffer.data, &ctx.tokenBuffer, &ctx.tokenLocationBuffer);
		lexer.position = file.start;

		lexer.lex();

		ctx.numLinesLexed += lexer.line;

		if (ctx.printLexemes) {
			writefln("// Lexemes `%s`", file.name);
			Token tok = Token(TokenType.init, file.firstTokenIndex);
			do
			{
				tok.type = ctx.tokenBuffer[tok.index];
				auto loc = ctx.tokenLocationBuffer[tok.index];
				writefln("%s %s, `%s`", tok, loc, loc.getTokenString(ctx.sourceBuffer.data));
				++tok.index;
			}
			while(tok.type != TokenType.EOI);
		}
	}
}


/// Start of input
enum char SOI_CHAR = '\2';
/// End of input
enum char EOI_CHAR = '\3';

/// Keyword spellings. `keyword_tokens[i]` is the token type of `keyword_strings[i]`
immutable string[] keyword_strings = ["auto","bool","true","false","alias","break","continue","do","else",
	"function","f32","f64","i16","i32","i64","i8","if","import","module","isize","return","struct","union","u16","u32",
	"u64","u8","usize","void","noreturn","while","for","switch","cast","enum","null"];
enum NUM_KEYWORDS = keyword_strings.length;
/// Token types of keywords, in the same order as `keyword_strings`
immutable TokenType[NUM_KEYWORDS] keyword_tokens = [TT.TYPE_AUTO,TT.TYPE_BOOL,TT.TRUE_LITERAL,TT.FALSE_LITERAL,
	TT.ALIAS_SYM, TT.BREAK_SYM,TT.CONTINUE_SYM,TT.DO_SYM,TT.ELSE_SYM,TT.FUNCTION_SYM,TT.TYPE_F32,
	TT.TYPE_F64,TT.TYPE_I16, TT.TYPE_I32,TT.TYPE_I64,TT.TYPE_I8,TT.IF_SYM,TT.IMPORT_SYM,TT.MODULE_SYM,
	TT.TYPE_ISIZE,TT.RETURN_SYM,
	TT.STRUCT_SYM,TT.UNION_SYM,TT.TYPE_U16,TT.TYPE_U32,TT.TYPE_U64,TT.TYPE_U8,TT.TYPE_USIZE,
	TT.TYPE_VOID,TT.TYPE_NORETURN,TT.WHILE_SYM,TT.FOR_SYM,TT.SWITCH_SYM,TT.CAST,TT.ENUM,TT.NULL];

/// Hand-written lexer.
/// Reads chars from `inputChars` starting at `position` and appends token types to
/// `outputTokens` and token locations to `outputTokenLocations`.
/// Input must be terminated with EOI_CHAR: the lexer relies on it to stop.
struct Lexer
{
	CompilationContext* context;
	const(char)[] inputChars; // contains data of all files
	Arena!TokenType* outputTokens;
	Arena!SourceLocation* outputTokenLocations;

	private dchar c; // current symbol

	private uint position; // offset of 'c' in input
	private uint line; // line of 'c'
	private uint column; // column of 'c'

	private uint startPos; // offset of first token byte in input
	private uint startLine; // line of first token byte
	private uint startCol; // column of first token byte

	/// Lexes tokens until EOI. Every token (including EOI) is stored together with its location
	void lex()
	{
		while (true)
		{
			TokenType tokType = nextToken();

			outputTokens.put(tokType);
			set_loc();

			if (tokType == TokenType.EOI) return;
		}
	}

	// Steps one char back. Used to revert a single consumed char on the same line
	private void prevChar()
	{
		--position;
		--column;
		c = inputChars[position];
	}

	// Advances to the next char of input
	private void nextChar()
	{
		++position;
		++column;
		c = inputChars[position];
	}

	// Stores location of the current token: [startPos, position) plus start line/column
	private void set_loc()
	{
		outputTokenLocations.put(SourceLocation(startPos, position, startLine, startCol));
	}

	/// Allows `foreach(TokenType tok; lexer)`. Iteration stops before EOI token
	int opApply(scope int delegate(TokenType) dg)
	{
		TokenType tok;
		while ((tok = nextToken()) != TokenType.EOI)
			if (int res = dg(tok))
				return res;
		return 0;
	}

	/// Skips whitespace and returns type of the next token.
	/// Token start is remembered in startPos/startLine/startCol, token end is `position`.
	/// On EOI_CHAR returns TT.EOI without advancing.
	TokenType nextToken()
	{
		c = inputChars[position];

		while (true)
		{
			// Restarted after each skipped whitespace char, so that location points to the token itself
			startPos = position;
			startLine = line;
			startCol = column;

			switch(c)
			{
				case SOI_CHAR:
					// manual nextChar, because we don't want to advance column
					++position;
					c = inputChars[position];
					return TT.SOI;

				case EOI_CHAR:         return TT.EOI;
				case '\t': nextChar;   continue;
				case '\n': lex_EOLN(); continue;
				case '\r': lex_EOLR(); continue;
				case ' ' : nextChar;   continue;
				case '!' : nextChar; return lex_multi_equal2(TT.NOT, TT.NOT_EQUAL);
				case '%' : nextChar; return lex_multi_equal2(TT.PERCENT, TT.PERCENT_EQUAL);
				case '&' : nextChar; return lex_multi_equal2_3('&', TT.AND, TT.AND_EQUAL, TT.AND_AND);
				case '(' : nextChar; return TT.LPAREN;
				case ')' : nextChar; return TT.RPAREN;
				case '*' : nextChar; return lex_multi_equal2(TT.STAR, TT.STAR_EQUAL);
				case '+' : nextChar; return lex_multi_equal2_3('+', TT.PLUS, TT.PLUS_EQUAL, TT.PLUS_PLUS);
				case ',' : nextChar; return TT.COMMA;
				case '-' : nextChar; return lex_multi_equal2_3('-', TT.MINUS, TT.MINUS_EQUAL, TT.MINUS_MINUS);
				case '.' : nextChar;
					if (c == '.') { nextChar;
						if (c == '.') { nextChar;
							return TT.DOT_DOT_DOT;
						}
						return TT.DOT_DOT;
					}
					return TT.DOT;
				case '\"': nextChar; return lex_QUOTE_QUOTE();
				case '\'': nextChar; return lex_QUOTE();
				case '/' :           return lex_SLASH();
				case '0' :           return lex_ZERO();
				case '1' : ..case '9': return lex_DIGIT();
				case ':' : nextChar; return TT.COLON;
				case ';' : nextChar; return TT.SEMICOLON;
				case '<' : nextChar;
					if (c == '<') { nextChar;
						if (c == '=') { nextChar;
							return TT.LESS_LESS_EQUAL;
						}
						return TT.LESS_LESS;
					}
					if (c == '=') { nextChar;
						return TT.LESS_EQUAL;
					}
					return TT.LESS;
				case '=' : nextChar; return lex_multi_equal2(TT.EQUAL, TT.EQUAL_EQUAL);
				case '?' : nextChar; return TT.QUESTION;
				case '>' : nextChar;
					if (c == '=') { nextChar;
						return TT.MORE_EQUAL;
					}
					if (c == '>') { nextChar;
						if (c == '>') { nextChar;
							if (c == '=') { nextChar;
								return TT.MORE_MORE_MORE_EQUAL;
							}
							return TT.MORE_MORE_MORE;
						}
						if (c == '=') { nextChar;
							return TT.MORE_MORE_EQUAL;
						}
						return TT.MORE_MORE;
					}
					return TT.MORE;
				case '@' : nextChar; return TT.AT;
				case '#' : nextChar; return lex_HASH();
				case '$' : nextChar; return lex_DOLLAR();
				case 'A' : ..case 'Z': return lex_LETTER();
				case '[' : nextChar; return TT.LBRACKET;
				case '\\': nextChar; return TT.BACKSLASH;
				case ']' : nextChar; return TT.RBRACKET;
				case '^' : nextChar; return lex_multi_equal2(TT.XOR, TT.XOR_EQUAL);
				case '_' :           return lex_LETTER();
				case 'a' : ..case 'z': return lex_LETTER();
				case '{' : nextChar; return TT.LCURLY;
				case '|' : nextChar; return lex_multi_equal2_3('|', TT.OR, TT.OR_EQUAL, TT.OR_OR);
				case '}' : nextChar; return TT.RCURLY;
				case '~' : nextChar; return lex_multi_equal2(TT.TILDE, TT.TILDE_EQUAL);
				default  : nextChar; return TT.INVALID;
			}
		}
	}

	private void lex_EOLR() // \r[\n]
	{
		nextChar;
		if (c == '\n') nextChar;
		onNewLine;
	}

	private void lex_EOLN() // \n
	{
		nextChar;
		onNewLine;
	}

	private void onNewLine()
	{
		++line;
		column = 0;
	}

	// Lex X= tokens
	// First char is already consumed. Returns eq_tok for `X=`, single_tok for `X`
	private TokenType lex_multi_equal2(TokenType single_tok, TokenType eq_tok)
	{
		if (c == '=') {
			nextChar;
			return eq_tok;
		}
		return single_tok;
	}

	// Lex X, X= and XX tokens, where X is chr
	// First char is already consumed
	private TokenType lex_multi_equal2_3(dchar chr, TokenType single_tok, TokenType eq_tok, TokenType double_tok)
	{
		if (c == chr) { nextChar;
			return double_tok;
		}
		if (c == '=') { nextChar;
			return eq_tok;
		}
		return single_tok;
	}

	// Emits token of given type covering [startPos, position), then reports
	// unrecoverable error at that token. Does not return
	private noreturn lexError(Args...)(TT type, string format, Args args) {
		outputTokens.put(type);
		set_loc();
		TokenIndex lastToken = TokenIndex(cast(uint)outputTokens.length-1);
		context.unrecoverable_error(lastToken, format, args);
	}

	// Lexes `//` comment (including the line end), `/* */` comment, `/=` or `/`
	private TokenType lex_SLASH() // /
	{
		nextChar;
		if (c == '/')
		{
			consumeLine();
			return TT.COMMENT;
		}
		if (c == '*')
		{
			nextChar;
			while (true)
			{
				switch(c)
				{
					case EOI_CHAR:
						lexError(TT.COMMENT, "Unterminated multiline comment");

					case '\n': lex_EOLN(); continue;
					case '\r': lex_EOLR(); continue;
					case '*':
						nextChar; // skip *
						if (c == '/') {
							nextChar; // skip /
							return TT.COMMENT;
						}
						// Char after `*` is not consumed yet and must be examined by the switch:
						// it can be another `*` (as in `**/`), a newline or EOI
						continue;
					default: break;
				}
				nextChar;
			}
		}
		if (c == '=') { nextChar;
			return TT.SLASH_EQUAL;
		}
		return TT.SLASH;
	}

	// Lexes string literal. Opening " is already consumed.
	// Literal may span multiple lines. Escape sequences are validated, but not decoded here
	private TokenType lex_QUOTE_QUOTE() // "
	{
		while (true)
		{
			switch(c)
			{
				case EOI_CHAR:
					lexError(TT.STRING_LITERAL, "Unexpected end of input inside string literal");

				case '\\':
					nextChar;
					lexEscapeSequence();
					break;

				case '\n': lex_EOLN(); continue;
				case '\r': lex_EOLR(); continue;
				case '\"':
					nextChar; // skip "
					return TT.STRING_LITERAL;
				default: nextChar; break;
			}
		}
	}

	// Validates and consumes escape sequence. `\` is already consumed.
	// \x, \u and \U must be followed by at least 2, 4 and 8 hex digits
	private void lexEscapeSequence() {
		switch(c)
		{
			case '\'':
			case '"':
			case '?':
			case '\\':
			case '0':
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
				nextChar;
				break;

			case 'x': lexHexEscape(2); break;
			case 'u': lexHexEscape(4); break;
			case 'U': lexHexEscape(8); break;
			default:
				lexError(TT.INVALID, "Invalid escape sequence");
		}
	}

	// Consumes x/u/U char of escape sequence and hex digits that follow it.
	// Reports an error if there are less than minDigits digits
	private void lexHexEscape(uint minDigits) {
		nextChar; // skip x/u/U
		if (consumeHexDigits() < minDigits)
			lexError(TT.INVALID, "Invalid escape sequence");
	}

	// Lexes char literal. Opening ' is already consumed
	private TokenType lex_QUOTE() // '
	{
		switch(c)
		{
			case EOI_CHAR:
				lexError(TT.CHAR_LITERAL, "Unexpected end of input inside char literal");

			case '\\':
				nextChar;
				lexEscapeSequence();
				break;

			case '\n': lex_EOLN(); break;
			case '\r': lex_EOLR(); break;
			default:
				nextChar;
				break;
		}
		if (c == '\'') {
			nextChar;
			return TT.CHAR_LITERAL;
		} else {
			lexError(TT.CHAR_LITERAL, "Invalid char literal");
		}
	}

	// Lexes literal that starts with 0: hex (0x), binary (0b) or decimal
	// NOTE(review): `0x` and `0b` without digits are accepted here
	private TokenType lex_ZERO() // 0
	{
		nextChar;

		if (c == 'x' || c == 'X')
		{
			nextChar;
			consumeHexadecimal();
			if (c == 'i' || c == 'u') skipIntSuffix;
			return TT.INT_HEX_LITERAL;
		}
		else if (c == 'b' || c == 'B')
		{
			nextChar;
			consumeBinary();
			if (c == 'i' || c == 'u') skipIntSuffix;
			return TT.INT_BIN_LITERAL;
		}
		else
		{
			return consumeDecimal();
		}
	}

	private TokenType lex_DIGIT() // 1-9
	{
		nextChar;
		return consumeDecimal();
	}


	// Lexes $alias, $type or $identifier. `$` is already consumed
	private TokenType lex_DOLLAR() // $
	{
		switch (c)
		{
			case 'a':
				if (match("alias")) { return TT.TYPE_ALIAS; } break;
			case 't':
				if (match("type")) { return TT.TYPE_TYPE; } break;
			default: break;
		}
		consumeId();
		return TT.CASH_IDENTIFIER;
	}

	// Lexes #if, #inline, #assert, #foreach, #version. `#` is already consumed.
	// Anything else is an error
	private TokenType lex_HASH() // #
	{
		switch (c)
		{
			case 'i':
				nextChar;
				switch(c) {
					case 'f': if (match("f")) { return TT.HASH_IF; } break;
					case 'n': if (match("nline")) { return TT.HASH_INLINE; } break;
					default: break;
				}
				break;
			case 'a':
				if (match("assert")) { return TT.HASH_ASSERT; } break;
			case 'f':
				if (match("foreach")) { return TT.HASH_FOREACH; } break;
			case 'v':
				if (match("version")) { return TT.HASH_VERSION; } break;
			default: break;
		}
		lexError(TT.INVALID, "Invalid # identifier");
	}

	// Lexes keyword, special keyword (__FILE__ etc) or identifier.
	// `match` doesn't rewind on mismatch, so at most one `match` call may be made per identifier.
	// Keywords that share a prefix are told apart by switching on the char after the prefix.
	// When no keyword matches, the rest of the chars are consumed as identifier
	private TokenType lex_LETTER() // a-zA-Z_
	{
		switch (c)
		{
			case '_':
				nextChar; // skip _
				if (c != '_') break;
				nextChar; // skip _
				switch(c) {
					case 'F':
						nextChar; // skip F
						switch(c) {
							case 'I': if (match("ILE__")) { return TT.SPECIAL_KW; } break; // __FILE__
							case 'U': if (match("UNCTION_NAME__")) { return TT.SPECIAL_KW; } break; // __FUNCTION_NAME__
							default: break;
						}
						break;
					case 'L': if (match("LINE__")) { return TT.SPECIAL_KW; } break; // __LINE__
					case 'M': if (match("MODULE_NAME__")) { return TT.SPECIAL_KW; } break; // __MODULE_NAME__
					default: break;
				}
				break;
			case 'a':
				nextChar; // skip a
				switch(c) {
					case 'l': if (match("lias")) { return TT.ALIAS_SYM; } break;
					case 'u': if (match("uto")) { return TT.TYPE_AUTO; } break;
					default: break;
				}
				break;
			case 'b':
				nextChar; // skip b
				switch(c) {
					case 'o': if (match("ool")) { return TT.TYPE_BOOL; } break;
					case 'r': if (match("reak")) { return TT.BREAK_SYM; } break;
					default: break;
				}
				break;
			case 'c':
				nextChar; // skip c
				switch(c) {
					case 'o': if (match("ontinue")) { return TT.CONTINUE_SYM; } break;
					case 'a': if (match("ast")) { return TT.CAST; } break;
					default: break;
				}
				break;
			case 'd': if (match("do")) { return TT.DO_SYM; } break;
			case 'e':
				nextChar; // skip e
				switch(c) {
					case 'l': if (match("lse")) { return TT.ELSE_SYM; } break;
					case 'n': if (match("num")) { return TT.ENUM; } break;
					default: break;
				}
				break;
			case 'f':
				nextChar; // skip f
				switch(c) {
					case 'a': if (match("alse")) { return TT.FALSE_LITERAL; } break;
					case '3': if (match("32")) { return TT.TYPE_F32; } break;
					case '6': if (match("64")) { return TT.TYPE_F64; } break;
					case 'o': if (match("or")) { return TT.FOR_SYM; } break;
					case 'u': if (match("unction")) { return TT.FUNCTION_SYM; } break;
					default: break;
				}
				break;
			case 'i':
				nextChar; // skip i
				switch(c) {
					case '1': if (match("16")) { return TT.TYPE_I16; } break;
					case '3': if (match("32")) { return TT.TYPE_I32; } break;
					case '6': if (match("64")) { return TT.TYPE_I64; } break;
					case '8': if (match("8"))  { return TT.TYPE_I8; } break;
					case 's': if (match("size")) { return TT.TYPE_ISIZE; } break;
					case 'f': if (match("f")) { return TT.IF_SYM; } break;
					case 'm': if (match("mport")) { return TT.IMPORT_SYM; } break;
					default: break;
				}
				break;
			case 'n':
				nextChar; // skip n
				switch(c) {
					case 'u': if (match("ull")) { return TT.NULL; } break;
					case 'o': if (match("oreturn")) { return TT.TYPE_NORETURN; } break;
					default: break;
				}
				break;
			case 'm': if (match("module")) { return TT.MODULE_SYM; } break;
			case 'r': if (match("return")) { return TT.RETURN_SYM; } break;
			case 's':
				nextChar; // skip s
				switch(c) {
					case 't': if (match("truct")) { return TT.STRUCT_SYM; } break;
					case 'w': if (match("witch")) { return TT.SWITCH_SYM; } break;
					default: break;
				}
				break;
			case 'u':
				nextChar; // skip u
				switch(c) {
					case '1': if (match("16")) { return TT.TYPE_U16; } break;
					case '3': if (match("32")) { return TT.TYPE_U32; } break;
					case '6': if (match("64")) { return TT.TYPE_U64; } break;
					case '8': if (match("8"))  { return TT.TYPE_U8; } break;
					case 'n': if (match("nion")){ return TT.UNION_SYM; } break;
					case 's': if (match("size")) { return TT.TYPE_USIZE; } break;
					default: break;
				}
				break;
			case 't': if (match("true")) { return TT.TRUE_LITERAL; } break;
			case 'v': if (match("void")) { return TT.TYPE_VOID; } break;
			case 'w': if (match("while")) { return TT.WHILE_SYM; } break;
			default: break;
		}

		consumeId();
		return TT.IDENTIFIER;
	}

	// Returns true if input at current position is `identifier` not followed by an identifier char.
	// Does not reset in case of mismatch, so we continue consuming chars as regular identifier.
	// Because of that the caller must not try another `match` after a failed one
	private bool match(string identifier)
	{
		uint index = 0;
		while (identifier[index] == c)
		{
			nextChar;
			++index;
			if (index == identifier.length)
			{
				// check that no valid symbol follows this id. ifff for if id.
				if (isIdSecond(c)) return false;
				return true;
			}
		}
		return false;
	}

	// Consumes the rest of identifier chars
	private void consumeId()
	{
		while (isIdSecond(c)) nextChar;
	}

	// Lexes the rest of decimal int or float literal, including optional
	// fraction, exponent and type suffix. First digit is already consumed
	private TokenType consumeDecimal()
	{
		// skip initial decimal literal
		while (isNumSecond(c)) nextChar;

		bool isFloat = false;

		if (c == 'f') {
			skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		} else if (c == 'i' || c == 'u') {
			skipIntSuffix;
			return TT.INT_DEC_LITERAL;
		}

		// check for "." followed by decimal
		if (c == '.') {
			nextChar; // skip .
			if (!isNumFirst(c)) {
				prevChar(); // revert .
				return TT.INT_DEC_LITERAL;
			}
			// skip second decimal literal
			while (isNumSecond(c)) nextChar;
			isFloat = true;
		}

		// check for exponent
		if (c == 'e' || c == 'E') {
			nextChar; // skip eE
			if (c == '+' || c == '-') nextChar; // skip -+
			if (!isNumFirst(c)) lexError(TT.INVALID, "Invalid char after exponent of float literal. Expected digit, got '%s'", c);
			// skip exponent
			while (isNumSecond(c)) nextChar;
			if (c == 'f') skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		} else if (c == 'f') {
			skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		}

		if (isFloat) return TT.FLOAT_DEC_LITERAL;
		return TT.INT_DEC_LITERAL;
	}

	// Consumes f32/f64 suffix of float literal. Current char is `f`
	private void skipFloatSuffix() {
		// f32/f64
		nextChar; // skip f
		if (c == '3') {
			nextChar; // skip 3
			if (c == '2') {
				nextChar; // skip 2
			} else lexError(TT.INVALID, "Invalid char after `f3` of float literal. Expected f32 or f64 suffix, got '%s'", c);
		} else if (c == '6') {
			nextChar; // skip 6
			if (c == '4') {
				nextChar; // skip 4
			} else lexError(TT.INVALID, "Invalid char after `f6` of float literal. Expected f32 or f64 suffix, got '%s'", c);
		}
		else lexError(TT.INVALID, "Invalid char after `f` of float literal. Expected f32 or f64 suffix, got '%s'", c);
	}

	// Consumes int literal suffix. Current char is `i` or `u`
	private void skipIntSuffix() {
		// i8/i16/i32/i64/u8/u16/u32/u64
		dchar intChar = c;
		nextChar; // skip i/u
		switch(c)
		{
			case '1':
				nextChar; // skip 1
				if (c == '6') {
					nextChar; // skip 6
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s1` of int literal. Expected %1$s16 suffix, got '%1$s1%s'", intChar, c);
			case '3':
				nextChar; // skip 3
				if (c == '2') {
					nextChar; // skip 2
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s3` of int literal. Expected %1$s32 suffix, got '%1$s3%s'", intChar, c);
			case '6':
				nextChar; // skip 6
				if (c == '4') {
					nextChar; // skip 4
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s6` of int literal. Expected %1$s64 suffix, got '%1$s6%s'", intChar, c);
			case '8':
				nextChar; // skip 8
				return;
			default: lexError(TT.INVALID, "Invalid char after `%1$s` of int literal. Expected %1$s8/%1$s16/%1$s32/%1$s64 suffix, got '%1$s%s'", intChar, c);
		}
	}

	// Consumes digits of hexadecimal number literal. `_` separators are allowed.
	// Returns number of consumed chars (separators are counted too)
	private uint consumeHexadecimal()
	{
		uint count;
		while (true)
		{
			if ('0' <= c && c <= '9') {
			} else if ('a' <= c && c <= 'f') {
			} else if ('A' <= c && c <= 'F') {
			} else if (c != '_') return count;
			nextChar;
			++count;
		}
	}

	// Consumes hex digits of an escape sequence and returns their number.
	// Unlike consumeHexadecimal `_` is not accepted: escape sequence digits are later
	// converted with std.conv.to!uint(16), which rejects separators
	private uint consumeHexDigits()
	{
		uint count;
		while (true)
		{
			bool isHex =
				('0' <= c && c <= '9') ||
				('a' <= c && c <= 'f') ||
				('A' <= c && c <= 'F');
			if (!isHex) return count;
			nextChar;
			++count;
		}
	}

	// Consumes digits of binary number literal. `_` separators are allowed
	private void consumeBinary()
	{
		while (true)
		{
			if (c == '0' || c == '1') {
			} else if (c != '_') return;
			nextChar;
		}
	}

	// Consumes all chars until the end of line (line end is consumed too) or EOI
	private void consumeLine()
	{
		while (true)
		{
			switch(c)
			{
				case EOI_CHAR: return;
				case '\n': lex_EOLN(); return;
				case '\r': lex_EOLR(); return;
				default: break;
			}
			nextChar;
		}
	}
}

// strRepr is string representation of a single char, without ' around
// and without leading \. For x, u and U all chars after the first one are parsed as hex number
dchar escapeToChar(const(char)[] strRepr) {
	import std.conv : to;
	switch (strRepr[0]) {
		case '\'': return '\'';
		case '"': return '\"';
		case '?': return '\?';
		case '\\': return '\\';
		case '0': return '\0';
		case 'a': return '\a';
		case 'b': return '\b';
		case 'f': return '\f';
		case 'n': return '\n';
		case 'r': return '\r';
		case 't': return '\t';
		case 'v': return '\v';
		case 'x': return strRepr[1..$].to!uint(16);
		case 'u': return strRepr[1..$].to!uint(16);
		case 'U': return strRepr[1..$].to!uint(16);
		default: assert(false, strRepr);
	}
}

// Returns value of a char literal. strRepr is content of the literal without ' around.
// It is either an escape sequence or a single char
dchar getCharValue(const(char)[] strRepr) {
	if (strRepr[0] == '\\') return escapeToChar(strRepr[1..$]);
	assert(strRepr.length == 1);
	return strRepr[0];
}

// Replaces escape sequences of str with chars they represent. Result is stored in sink.
// Only handles valid strings
// We copy it into buffer, then run through it modifying in-place
// \x is written as a single byte, \u and \U are written as utf-8
string handleEscapedString(ref Arena!ubyte sink, const(char)[] str)
{
	import std.conv : to;
	char* dstStart = cast(char*)sink.nextPtr;
	sink.put(cast(ubyte[])str);
	sink.put(0); // we will look for this 0 to end the loop
	char* src = cast(char*)dstStart;
	char* dst = cast(char*)dstStart;

	loop:
	while(*src) // look for \0 we put into buffer
	{
		if (*src == '\\')
		{
			++src; // skip \

			// Read escaped char
			dchar c;
			switch (*src) {
				case '\'': c = '\''; break;
				case '"': c = '\"'; break;
				case '?': c = '\?'; break;
				case '\\': c = '\\'; break;
				case '0': c = '\0'; break;
				case 'a': c = '\a'; break;
				case 'b': c = '\b'; break;
				case 'f': c = '\f'; break;
				case 'n': c = '\n'; break;
				case 'r': c = '\r'; break;
				case 't': c = '\t'; break;
				case 'v': c = '\v'; break;
				case 'x':
					c = (cast(char[])src[1..3]).to!uint(16); src += 3;
					*dst++ = cast(ubyte)c;
					continue loop; // skip rest, as this represents byte value, not unicode char
				case 'u': c = (cast(char[])src[1..5]).to!uint(16); src += 4; break;
				case 'U': c = (cast(char[])src[1..9]).to!uint(16); src += 8; break;
				default: assert(false, "Invalid escape sequence");
			}
			// For bigger cases (u, U) we increment additionally. (x) is not handled here
			++src;

			// Write utf-8 back
			// NOTE(review): values >= 0x110000 are silently dropped
			if (c < 0x80) {
				*dst++ = cast(ubyte)c;
			} else if (c < 0x800) {
				*dst++ = 0xC0 | cast(ubyte)(c >> 6);
				*dst++ = 0x80 | (c & 0x3f);
			} else if (c < 0x10000) {
				*dst++ = 0xE0 | cast(ubyte)(c >> 12);
				*dst++ = 0x80 | ((c >> 6) & 0x3F);
				*dst++ = 0x80 | (c & 0x3f);
			} else if (c < 0x110000) {
				*dst++ = 0xF0 | cast(ubyte)(c >> 18);
				*dst++ = 0x80 | ((c >> 12) & 0x3F);
				*dst++ = 0x80 | ((c >> 6) & 0x3F);
				*dst++ = 0x80 | (c & 0x3f);
			}
		}
		else // non-escaped char. Just copy
		{
			*dst++ = *src++;
		}
	}
	size_t len = cast(size_t)(dst - dstStart);
	sink.unput(str.length + 1 - len); // unput \0 too
	return cast(string)dstStart[0..len];
}

// True for chars that can follow the first char of identifier
private bool isIdSecond(dchar chr) pure nothrow {
	return
		'0' <= chr && chr <= '9' ||
		'a' <= chr && chr <= 'z' ||
		'A' <= chr && chr <= 'Z' ||
		chr == '_';
}

// True for chars that can start a decimal number
private bool isNumFirst(dchar chr) pure nothrow {
	return '0' <= chr && chr <= '9';
}

// True for chars that can follow the first char of decimal number
private bool isNumSecond(dchar chr) pure nothrow {
	return '0' <= chr && chr <= '9' || chr == '_';
}


unittest
{
	CompilationContext ctx;
	ubyte[64] tokenBuffer;
	ubyte[64] locs;

	Lexer makeLexer(string input) {
		ctx.tokenBuffer.setBuffer(tokenBuffer[], 64);
		ctx.tokenLocationBuffer.setBuffer(locs[], 64);
		return Lexer(&ctx, input~EOI_CHAR, &ctx.tokenBuffer, &ctx.tokenLocationBuffer);
	}

	foreach(i, string keyword; keyword_strings)
	{
		Lexer lexer = makeLexer(keyword);
		TokenType token = lexer.nextToken;
		assert(token == keyword_tokens[i],
			format("For %s expected %s got %s", keyword, keyword_tokens[i], token));
	}

	foreach(i, string keyword; keyword_strings)
	{
		Lexer lexer = makeLexer(keyword~"A");
		TokenType token = lexer.nextToken;
		assert(token == TT.IDENTIFIER);
	}

	// Identifiers made of a broken keyword prefix followed by a tail of another keyword
	foreach(string id; ["coast", "boreak", "aluto", "aliasuto", "nuoreturn", "stwitch",
		"faor", "fa32", "f3or", "elnum", "__FLINE__", "__FIUNCTION_NAME__"])
	{
		Lexer lexer = makeLexer(id);
		TokenType token = lexer.nextToken;
		assert(token == TT.IDENTIFIER,
			format("For %s expected %s got %s", id, TT.IDENTIFIER, token));
	}

	foreach(string kw; ["__FILE__", "__LINE__", "__FUNCTION_NAME__", "__MODULE_NAME__"])
	{
		Lexer lexer = makeLexer(kw);
		TokenType token = lexer.nextToken;
		assert(token == TT.SPECIAL_KW,
			format("For %s expected %s got %s", kw, TT.SPECIAL_KW, token));
	}

	{
		string[] ops = ["&","&&","&=","@","\\",":",",",".","..","...",
			"=","==",">",">=",">>",">>=",">>>",">>>=","<","<=","<<","<<=","-",
			"-=","--","!","!=","|","|=","||","%","%=","+","+=","++","?",";","/",
			"/=","*","*=","~","~=","^","^=","(",")","[","]","{","}",];
		TokenType[] tokens_ops = [TT.AND,TT.AND_AND,TT.AND_EQUAL,TT.AT,TT.BACKSLASH,
			TT.COLON,TT.COMMA,TT.DOT,TT.DOT_DOT,TT.DOT_DOT_DOT,TT.EQUAL,
			TT.EQUAL_EQUAL,TT.MORE,TT.MORE_EQUAL,TT.MORE_MORE,
			TT.MORE_MORE_EQUAL,TT.MORE_MORE_MORE,TT.MORE_MORE_MORE_EQUAL,
			TT.LESS,TT.LESS_EQUAL,TT.LESS_LESS,TT.LESS_LESS_EQUAL,TT.MINUS,
			TT.MINUS_EQUAL,TT.MINUS_MINUS,TT.NOT,TT.NOT_EQUAL,TT.OR,TT.OR_EQUAL,
			TT.OR_OR,TT.PERCENT,TT.PERCENT_EQUAL,TT.PLUS,TT.PLUS_EQUAL,TT.PLUS_PLUS,
			TT.QUESTION,TT.SEMICOLON,TT.SLASH,TT.SLASH_EQUAL,TT.STAR,TT.STAR_EQUAL,
			TT.TILDE,TT.TILDE_EQUAL,TT.XOR,TT.XOR_EQUAL,TT.LPAREN,TT.RPAREN,
			TT.LBRACKET,TT.RBRACKET, TT.LCURLY,TT.RCURLY,];
		foreach(i, string op; ops)
		{
			Lexer lexer = makeLexer(op);
			TokenType token = lexer.nextToken;
			assert(token == tokens_ops[i],
				format("For %s expected %s got %s", op, tokens_ops[i], token));
		}
	}

	void testNumeric(string input, TokenType tokType)
	{
		Lexer lexer = makeLexer(input);
		assert(lexer.nextToken == tokType);
	}

	assert(makeLexer("_10").nextToken == TT.IDENTIFIER);
	testNumeric("10", TT.INT_DEC_LITERAL);
	testNumeric("1_0", TT.INT_DEC_LITERAL);
	testNumeric("10_", TT.INT_DEC_LITERAL);
	testNumeric("10.0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10.0e0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10.0e+0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10.0e-0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10.0E+0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10.0E-0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10e0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10E0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10e+0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10e-0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10E+0", TT.FLOAT_DEC_LITERAL);
	testNumeric("10E-0", TT.FLOAT_DEC_LITERAL);
	testNumeric("0xFF", TT.INT_HEX_LITERAL);
	testNumeric("0XABCDEF0123456789", TT.INT_HEX_LITERAL);
	testNumeric("0x1_0", TT.INT_HEX_LITERAL);
	testNumeric("0b10", TT.INT_BIN_LITERAL);
	testNumeric("0B10", TT.INT_BIN_LITERAL);
	testNumeric("0b1_0", TT.INT_BIN_LITERAL);

	{
		string source = "/*\n*/test";
		Lexer lexer = makeLexer(source);
		lexer.lex;
		assert(tokenBuffer[0] == TT.COMMENT);
		assert(ctx.tokenLocationBuffer[0].getTokenString(source) == "/*\n*/", format("%s", ctx.tokenLocationBuffer[0]));
		assert(tokenBuffer[1] == TT.IDENTIFIER);
		assert(ctx.tokenLocationBuffer[1].getTokenString(source) == "test");
	}
	{
		// `*` right before closing `*/`
		string source = "/***/test";
		Lexer lexer = makeLexer(source);
		lexer.lex;
		assert(tokenBuffer[0] == TT.COMMENT);
		assert(ctx.tokenLocationBuffer[0].getTokenString(source) == "/***/", format("%s", ctx.tokenLocationBuffer[0]));
		assert(tokenBuffer[1] == TT.IDENTIFIER);
		assert(ctx.tokenLocationBuffer[1].getTokenString(source) == "test");
	}
	{
		string source = "//test\nhello";
		Lexer lexer = makeLexer(source);
		lexer.lex;
		assert(tokenBuffer[0] == TT.COMMENT);
		assert(ctx.tokenLocationBuffer[0].getTokenString(source) == "//test\n");
		assert(tokenBuffer[1] == TT.IDENTIFIER);
		assert(ctx.tokenLocationBuffer[1].getTokenString(source) == "hello");
	}
	{
		string source = `"literal"`;
		Lexer lexer = makeLexer(source);
		lexer.lex;
		assert(tokenBuffer[0] == TT.STRING_LITERAL);
		assert(ctx.tokenLocationBuffer[0].getTokenString(source) == `"literal"`, format("%s", tokenBuffer[0]));
	}
	{
		string source = `'@'`;
		Lexer lexer = makeLexer(source);
		lexer.lex;
		assert(tokenBuffer[0] == TT.CHAR_LITERAL);
		assert(ctx.tokenLocationBuffer[0].getTokenString(source) == `'@'`, format("%s", tokenBuffer[0]));
	}
}