1 /**
2 Copyright: Copyright (c) 2017-2019 Andrey Penechko.
3 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
4 Authors: Andrey Penechko.
5 */
6 /// Lexer
7 module vox.fe.passes.lexer;
8 
9 import std.format : formattedWrite;
10 import std.string : format;
11 import std.range : repeat;
12 import std.stdio;
13 
14 import vox.all;
15 
16 
/// Lexer pass: tokenizes every file registered in `ctx.files`.
///
/// For each file a fresh `Lexer` is created over the shared source buffer and started
/// at `file.start`; tokens and their locations are appended to `ctx.tokenBuffer` and
/// `ctx.tokenLocationBuffer`. The index of the file's first token is stored in
/// `file.firstTokenIndex` and the lexer's final line counter is added to `ctx.numLinesLexed`.
/// When `ctx.printLexemes` is set, all tokens of the file (including the final EOI) are
/// printed to stdout.
///
/// Params:
///   ctx = compilation context that owns the source, token and location buffers
///   subPasses = not used by this pass
void pass_lexer(ref CompilationContext ctx, CompilePassPerModule[] subPasses)
{
	foreach (ref SourceFileInfo file; ctx.files.data)
	{
		// Tokens of this file begin at the current end of the location buffer
		file.firstTokenIndex = TokenIndex(ctx.tokenLocationBuffer.uintLength);

		auto fileLexer = Lexer(&ctx, ctx.sourceBuffer.data, &ctx.tokenBuffer, &ctx.tokenLocationBuffer);
		fileLexer.position = file.start;
		fileLexer.lex();

		ctx.numLinesLexed += fileLexer.line;

		if (!ctx.printLexemes) continue;

		writefln("// Lexemes `%s`", file.name);
		Token current = Token(TokenType.init, file.firstTokenIndex);
		while (true)
		{
			current.type = ctx.tokenBuffer[current.index];
			auto location = ctx.tokenLocationBuffer[current.index];
			writefln("%s %s, `%s`", current, location, location.getTokenString(ctx.sourceBuffer.data));
			++current.index;
			// EOI is printed too; it is the last token of the file
			if (current.type == TokenType.EOI) break;
		}
	}
}
44 
45 
/// Start of input.
/// Sentinel byte that `Lexer.nextToken` reports as `TT.SOI`; it is consumed without advancing the column counter.
enum char SOI_CHAR = '\2';
/// End of input.
/// Sentinel byte that `Lexer.nextToken` reports as `TT.EOI`; `Lexer.lex` stops after emitting that token.
enum char EOI_CHAR = '\3';
50 
/// Spellings of the keywords recognized by `Lexer.lex_LETTER`.
/// Parallel to `keyword_tokens`: `keyword_strings[i]` is expected to lex as `keyword_tokens[i]`
/// (the unittest of this module checks exactly that), so both tables must be kept in the same order.
immutable string[] keyword_strings = ["auto","bool","true","false","alias","break","continue","do","else",
	"function","f32","f64","i16","i32","i64","i8","if","import","module","isize","return","struct","union","u16","u32",
	"u64","u8","usize","void","noreturn","while","for","switch","cast","enum","null"];
/// Number of entries in `keyword_strings`; also fixes the length of `keyword_tokens`.
enum NUM_KEYWORDS = keyword_strings.length;
/// Token type for each entry of `keyword_strings`, at the same index.
immutable TokenType[NUM_KEYWORDS] keyword_tokens = [TT.TYPE_AUTO,TT.TYPE_BOOL,TT.TRUE_LITERAL,TT.FALSE_LITERAL,
	TT.ALIAS_SYM, TT.BREAK_SYM,TT.CONTINUE_SYM,TT.DO_SYM,TT.ELSE_SYM,TT.FUNCTION_SYM,TT.TYPE_F32,
	TT.TYPE_F64,TT.TYPE_I16, TT.TYPE_I32,TT.TYPE_I64,TT.TYPE_I8,TT.IF_SYM,TT.IMPORT_SYM,TT.MODULE_SYM,
	TT.TYPE_ISIZE,TT.RETURN_SYM, TT.STRUCT_SYM,TT.UNION_SYM,TT.TYPE_U16,TT.TYPE_U32,TT.TYPE_U64,TT.TYPE_U8,TT.TYPE_USIZE,
	TT.TYPE_VOID,TT.TYPE_NORETURN,TT.WHILE_SYM,TT.FOR_SYM,TT.SWITCH_SYM,TT.CAST,TT.ENUM,TT.NULL];
60 
/// Hand-written lexer over the shared source buffer.
///
/// `lex` appends one `TokenType` to `outputTokens` and one `SourceLocation` to
/// `outputTokenLocations` per token, up to and including `TT.EOI`.
/// Spaces, tabs and line breaks between tokens are skipped; comments are emitted as
/// `TT.COMMENT` tokens. Scanning relies on an `EOI_CHAR` sentinel being present in the
/// input: it is the only thing that stops the lexer.
///
/// Unrecoverable problems (unterminated comment/literal, bad escape or literal suffix,
/// unknown `#` identifier) are reported through `lexError`, which does not return.
struct Lexer
{
	CompilationContext* context;
	const(char)[] inputChars; // contains data of all files
	Arena!TokenType* outputTokens;
	Arena!SourceLocation* outputTokenLocations;

	private dchar c; // current symbol

	private uint position; // offset of 'c' in input
	private uint line; // line of 'c'
	private uint column; // column of 'c'

	private uint startPos; // offset of first token byte in input
	private uint startLine; // line of first token byte
	private uint startCol; // column of first token byte

	/// Lexes tokens starting at `position` until `TT.EOI` has been emitted.
	/// Every token type is stored in `outputTokens` together with its location.
	void lex()
	{
		while (true)
		{
			TokenType tokType = nextToken();

			outputTokens.put(tokType);
			set_loc();

			if (tokType == TokenType.EOI) return;
		}
	}

	/// Steps one char back. Must not be used to step back over a line break,
	/// because `line` is not adjusted.
	private void prevChar()
	{
		--position;
		--column;
		c = inputChars[position];
	}

	/// Advances to the next char of the input and loads it into `c`.
	private void nextChar()
	{
		++position;
		++column;
		c = inputChars[position];
	}

	/// Records the location of the token that spans `startPos .. position`.
	private void set_loc()
	{
		outputTokenLocations.put(SourceLocation(startPos, position, startLine, startCol));
	}

	/// Allows `foreach (TokenType t; lexer)`. Iteration stops before `TT.EOI`.
	/// Nothing is written to the output arenas by this iteration itself.
	int opApply(scope int delegate(TokenType) dg)
	{
		TokenType tok;
		while ((tok = nextToken()) != TokenType.EOI)
			if (int res = dg(tok))
				return res;
		return 0;
	}

	/// Skips whitespace and line breaks, then lexes and returns a single token.
	/// `startPos`/`startLine`/`startCol` are left pointing at the first char of that token.
	/// Chars that do not start any token are consumed and reported as `TT.INVALID`.
	TokenType nextToken()
	{
		c = inputChars[position];

		while (true)
		{
			// Reset on every iteration, so skipped whitespace never becomes part of a token
			startPos = position;
			startLine = line;
			startCol = column;

			switch(c)
			{
				case SOI_CHAR:
					// manual nextChar, because we don't want to advance column
					++position;
					c = inputChars[position];
					return TT.SOI;

				case EOI_CHAR:         return TT.EOI;
				case '\t': nextChar;   continue;
				case '\n': lex_EOLN(); continue;
				case '\r': lex_EOLR(); continue;
				case ' ' : nextChar;   continue;
				case '!' : nextChar; return lex_multi_equal2(TT.NOT, TT.NOT_EQUAL);
				case '%' : nextChar; return lex_multi_equal2(TT.PERCENT, TT.PERCENT_EQUAL);
				case '&' : nextChar; return lex_multi_equal2_3('&', TT.AND, TT.AND_EQUAL, TT.AND_AND);
				case '(' : nextChar; return TT.LPAREN;
				case ')' : nextChar; return TT.RPAREN;
				case '*' : nextChar; return lex_multi_equal2(TT.STAR, TT.STAR_EQUAL);
				case '+' : nextChar; return lex_multi_equal2_3('+', TT.PLUS, TT.PLUS_EQUAL, TT.PLUS_PLUS);
				case ',' : nextChar; return TT.COMMA;
				case '-' : nextChar; return lex_multi_equal2_3('-', TT.MINUS, TT.MINUS_EQUAL, TT.MINUS_MINUS);
				case '.' : nextChar; // . .. ...
					if (c == '.') { nextChar;
						if (c == '.') { nextChar;
							return TT.DOT_DOT_DOT;
						}
						return TT.DOT_DOT;
					}
					return TT.DOT;
				case '\"': nextChar; return lex_QUOTE_QUOTE();
				case '\'': nextChar; return lex_QUOTE();
				case '/' :           return lex_SLASH();
				case '0' :           return lex_ZERO();
				case '1' : ..case '9': return lex_DIGIT();
				case ':' : nextChar; return TT.COLON;
				case ';' : nextChar; return TT.SEMICOLON;
				case '<' : nextChar; // < <= << <<=
					if (c == '<') { nextChar;
						if (c == '=') { nextChar;
							return TT.LESS_LESS_EQUAL;
						}
						return TT.LESS_LESS;
					}
					if (c == '=') { nextChar;
						return TT.LESS_EQUAL;
					}
					return TT.LESS;
				case '=' : nextChar; return lex_multi_equal2(TT.EQUAL, TT.EQUAL_EQUAL);
				case '?' : nextChar; return TT.QUESTION;
				case '>' : nextChar; // > >= >> >>= >>> >>>=
					if (c == '=') { nextChar;
						return TT.MORE_EQUAL;
					}
					if (c == '>') { nextChar;
						if (c == '>') { nextChar;
							if (c == '=') { nextChar;
								return TT.MORE_MORE_MORE_EQUAL;
							}
							return TT.MORE_MORE_MORE;
						}
						if (c == '=') { nextChar;
							return TT.MORE_MORE_EQUAL;
						}
						return TT.MORE_MORE;
					}
					return TT.MORE;
				case '@' : nextChar; return TT.AT;
				case '#' : nextChar; return lex_HASH();
				case '$' : nextChar; return lex_DOLLAR();
				case 'A' : ..case 'Z': return lex_LETTER();
				case '[' : nextChar; return TT.LBRACKET;
				case '\\': nextChar; return TT.BACKSLASH;
				case ']' : nextChar; return TT.RBRACKET;
				case '^' : nextChar; return lex_multi_equal2(TT.XOR, TT.XOR_EQUAL);
				case '_' : return lex_LETTER();
				case 'a' : ..case 'z': return lex_LETTER();
				case '{' : nextChar; return TT.LCURLY;
				case '|' : nextChar; return lex_multi_equal2_3('|', TT.OR, TT.OR_EQUAL, TT.OR_OR);
				case '}' : nextChar; return TT.RCURLY;
				case '~' : nextChar; return lex_multi_equal2(TT.TILDE, TT.TILDE_EQUAL);
				default  : nextChar; return TT.INVALID;
			}
		}
	}

	/// Consumes a line break that starts with `\r` (`\r` or `\r\n`) and counts one new line.
	private void lex_EOLR() // \r[\n]
	{
		nextChar;
		if (c == '\n') nextChar;
		onNewLine;
	}

	/// Consumes a `\n` line break and counts one new line.
	private void lex_EOLN() // \n
	{
		nextChar;
		onNewLine;
	}

	/// Updates the line/column counters after a line break has been consumed.
	private void onNewLine()
	{
		++line;
		column = 0;
	}

	/// Lex X= tokens. The first char (X) is already consumed.
	/// Returns `eq_tok` (consuming `=`) if `=` follows, otherwise `single_tok`.
	private TokenType lex_multi_equal2(TokenType single_tok, TokenType eq_tok)
	{
		if (c == '=') {
			nextChar;
			return eq_tok;
		}
		return single_tok;
	}

	/// Lex X, XX and X= tokens. The first char (X == `chr`) is already consumed.
	/// The doubled form takes precedence over the `=` form.
	private TokenType lex_multi_equal2_3(dchar chr, TokenType single_tok, TokenType eq_tok, TokenType double_tok)
	{
		if (c == chr) { nextChar;
			return double_tok;
		}
		if (c == '=') { nextChar;
			return eq_tok;
		}
		return single_tok;
	}

	/// Emits the token being lexed as `type` (with its location), then reports an
	/// unrecoverable error pointing at that token. Never returns.
	private noreturn lexError(Args...)(TT type, string format, Args args) {
		outputTokens.put(type);
		set_loc();
		TokenIndex lastToken = TokenIndex(cast(uint)outputTokens.length-1);
		context.unrecoverable_error(lastToken, format, args);
	}

	/// Lexes a token starting with `/`: line comment, block comment, `/=` or `/`.
	/// A line comment token includes its terminating line break.
	/// Block comments do not nest; reaching EOI inside one is an error.
	private TokenType lex_SLASH() // /
	{
		nextChar;
		if (c == '/')
		{
			consumeLine();
			return TT.COMMENT;
		}
		if (c == '*')
		{
			nextChar;
			while (true)
			{
				switch(c)
				{
					case EOI_CHAR:
						lexError(TT.COMMENT, "Unterminated multiline comment");

					case '\n': lex_EOLN(); continue;
					case '\r': lex_EOLR(); continue;
					case '*':
						nextChar;
						if (c == '/') {
							nextChar;
							return TT.COMMENT;
						}
						// The char after '*' must be examined by the loop itself: it may be
						// another '*' (as in `**/`), a line break that has to be counted, or EOI.
						// Skipping it unexamined would miss all three.
						continue;
					default: break;
				}
				nextChar;
			}
		}
		if (c == '=') { nextChar;
			return TT.SLASH_EQUAL;
		}
		return TT.SLASH;
	}

	/// Lexes the rest of a string literal; the opening `"` is already consumed.
	/// Line breaks are allowed inside the literal and are counted.
	private TokenType lex_QUOTE_QUOTE() // "
	{
		while (true)
		{
			switch(c)
			{
				case EOI_CHAR:
					lexError(TT.STRING_LITERAL, "Unexpected end of input inside string literal");

				case '\\':
					nextChar;
					lexEscapeSequence();
					break;

				case '\n': lex_EOLN(); continue;
				case '\r': lex_EOLR(); continue;
				case '\"':
					nextChar; // skip "
					return TT.STRING_LITERAL;
				default: nextChar; break;
			}
		}
	}

	/// Validates and consumes an escape sequence; the `\` is already consumed.
	/// `\x`, `\u` and `\U` require at least 2, 4 and 8 hexadecimal digits respectively.
	/// Anything else that is not a known single-char escape is an error.
	private void lexEscapeSequence() {
		switch(c)
		{
			case '\'':
			case '"':
			case '?':
			case '\\':
			case '0':
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
				nextChar;
				break;

			case 'x': lexHexEscape(2); break;
			case 'u': lexHexEscape(4); break;
			case 'U': lexHexEscape(8); break;
			default:
				lexError(TT.INVALID, "Invalid escape sequence");
		}
	}

	/// Consumes the x/u/U selector of a numeric escape and the hex digits after it.
	/// Reports an error when fewer than `minDigits` digits are present.
	private void lexHexEscape(uint minDigits)
	{
		nextChar; // skip x/u/U
		if (consumeHexDigits() < minDigits)
			lexError(TT.INVALID, "Invalid escape sequence");
	}

	/// Lexes the rest of a char literal; the opening `'` is already consumed.
	/// Exactly one char (one byte of input) or one escape sequence is accepted before the closing `'`.
	private TokenType lex_QUOTE() // '
	{
		switch(c)
		{
			case EOI_CHAR:
				lexError(TT.CHAR_LITERAL, "Unexpected end of input inside char literal");

			case '\\':
				nextChar;
				lexEscapeSequence();
				break;

			case '\n': lex_EOLN(); break;
			case '\r': lex_EOLR(); break;
			default:
				nextChar;
				break;
		}
		if (c == '\'') {
			nextChar;
			return TT.CHAR_LITERAL;
		} else {
			lexError(TT.CHAR_LITERAL, "Invalid char literal");
		}
	}

	/// Lexes a numeric literal that starts with `0`: hexadecimal (`0x`/`0X`),
	/// binary (`0b`/`0B`) or decimal. Hex and binary literals may carry an int suffix.
	private TokenType lex_ZERO() // 0
	{
		nextChar;

		if (c == 'x' || c == 'X')
		{
			nextChar;
			consumeHexadecimal();
			if (c == 'i' || c == 'u') skipIntSuffix;
			return TT.INT_HEX_LITERAL;
		}
		else if (c == 'b' || c == 'B')
		{
			nextChar;
			consumeBinary();
			if (c == 'i' || c == 'u') skipIntSuffix;
			return TT.INT_BIN_LITERAL;
		}
		else
		{
			return consumeDecimal();
		}
	}

	/// Lexes a decimal literal that starts with a non-zero digit.
	private TokenType lex_DIGIT() // 1-9
	{
		nextChar;
		return consumeDecimal();
	}


	/// Lexes a `$` token; the `$` is already consumed.
	/// `$alias` and `$type` are dedicated tokens, anything else is a `TT.CASH_IDENTIFIER`.
	private TokenType lex_DOLLAR() // $
	{
		switch (c)
		{
			case 'a':
				if (match("alias")) { return TT.TYPE_ALIAS; } break;
			case 't':
				if (match("type")) { return TT.TYPE_TYPE; } break;
			default: break;
		}
		consumeId();
		return TT.CASH_IDENTIFIER;
	}

	/// Lexes a `#` token; the `#` is already consumed.
	/// Recognizes `#if`, `#inline`, `#assert`, `#foreach` and `#version`; anything else is an error.
	private TokenType lex_HASH() // #
	{
		switch (c)
		{
			case 'i':
				nextChar;
				switch(c) {
					case 'f': if (match("f")) { return TT.HASH_IF; } break;
					case 'n': if (match("nline")) { return TT.HASH_INLINE; } break;
					default: break;
				}
				break;
			case 'a':
				if (match("assert")) { return TT.HASH_ASSERT; } break;
			case 'f':
				if (match("foreach")) { return TT.HASH_FOREACH; } break;
			case 'v':
				if (match("version")) { return TT.HASH_VERSION; } break;
			default: break;
		}
		lexError(TT.INVALID, "Invalid # identifier");
	}

	/// Lexes a keyword, a special `__XXX__` keyword or an identifier.
	/// `c` is the first char of the word and is not consumed yet.
	///
	/// `match` does not rewind on failure, so every path through this function calls it
	/// at most once. The candidate keyword is selected by looking at the next char
	/// before matching. Trying a second keyword after a failed match would continue
	/// from the middle of the word and could accept e.g. `coast` as `cast`.
	private TokenType lex_LETTER() // a-zA-Z_
	{
		switch (c)
		{
			case '_':
				nextChar; // skip _
				if (c != '_') break;
				nextChar; // skip _
				switch(c) {
					case 'F':
						nextChar; // skip F
						switch(c) {
							case 'I': if (match("ILE__")) { return TT.SPECIAL_KW; } break; // __FILE__
							case 'U': if (match("UNCTION_NAME__")) { return TT.SPECIAL_KW; } break; // __FUNCTION_NAME__
							default: break;
						}
						break;
					case 'L': if (match("LINE__")) { return TT.SPECIAL_KW; } break; // __LINE__
					case 'M': if (match("MODULE_NAME__")) { return TT.SPECIAL_KW; } break; // __MODULE_NAME__
					default: break;
				}
				break;
			case 'a':
				nextChar; // skip a
				switch(c) {
					case 'l': if (match("lias")) { return TT.ALIAS_SYM; } break;
					case 'u': if (match("uto")) { return TT.TYPE_AUTO; } break;
					default: break;
				}
				break;
			case 'b':
				nextChar;
				switch(c) {
					case 'o': if (match("ool")) { return TT.TYPE_BOOL; } break;
					case 'r': if (match("reak")) { return TT.BREAK_SYM; } break;
					default: break;
				}
				break;
			case 'c':
				nextChar;
				switch(c) {
					case 'o': if (match("ontinue")) { return TT.CONTINUE_SYM; } break;
					case 'a': if (match("ast")) { return TT.CAST; } break;
					default: break;
				}
				break;
			case 'd': if (match("do")) { return TT.DO_SYM; } break;
			case 'e':
				nextChar;
				switch(c) {
					case 'l': if (match("lse")) { return TT.ELSE_SYM; } break;
					case 'n': if (match("num")) { return TT.ENUM; } break;
					default: break;
				}
				break;
			case 'f':
				nextChar;
				switch(c) {
					case 'a': if (match("alse")) { return TT.FALSE_LITERAL; } break;
					case '3': if (match("32")) { return TT.TYPE_F32; } break;
					case '6': if (match("64")) { return TT.TYPE_F64; } break;
					case 'o': if (match("or")) { return TT.FOR_SYM; } break;
					case 'u': if (match("unction")) { return TT.FUNCTION_SYM; } break;
					default: break;
				}
				break;
			case 'i':
				nextChar;
				switch(c) {
					case '1': if (match("16")) { return TT.TYPE_I16; } break;
					case '3': if (match("32")) { return TT.TYPE_I32; } break;
					case '6': if (match("64")) { return TT.TYPE_I64; } break;
					case '8': if (match("8"))  { return TT.TYPE_I8; }  break;
					case 's': if (match("size")) { return TT.TYPE_ISIZE; } break;
					case 'f': if (match("f")) { return TT.IF_SYM; } break;
					case 'm': if (match("mport")) { return TT.IMPORT_SYM; } break;
					default: break;
				}
				break;
			case 'n':
				nextChar;
				switch(c) {
					case 'u': if (match("ull")) { return TT.NULL; } break;
					case 'o': if (match("oreturn")) { return TT.TYPE_NORETURN; } break;
					default: break;
				}
				break;
			case 'm': if (match("module")) { return TT.MODULE_SYM; } break;
			case 'r': if (match("return")) { return TT.RETURN_SYM; } break;
			case 's':
				nextChar;
				switch(c) {
					case 't': if (match("truct")) { return TT.STRUCT_SYM; } break;
					case 'w': if (match("witch")) { return TT.SWITCH_SYM; } break;
					default: break;
				}
				break;
			case 'u':
				nextChar;
				switch(c) {
					case '1': if (match("16")) { return TT.TYPE_U16; } break;
					case '3': if (match("32")) { return TT.TYPE_U32; } break;
					case '6': if (match("64")) { return TT.TYPE_U64; } break;
					case '8': if (match("8"))  { return TT.TYPE_U8; }  break;
					case 'n': if (match("nion")){ return TT.UNION_SYM; }  break;
					case 's': if (match("size")) { return TT.TYPE_USIZE; } break;
					default: break;
				}
				break;
			case 't': if (match("true")) { return TT.TRUE_LITERAL; } break;
			case 'v': if (match("void")) { return TT.TYPE_VOID; } break;
			case 'w': if (match("while")) { return TT.WHILE_SYM; } break;
			default: break;
		}

		// Not a keyword: whatever is left of the word belongs to an identifier
		consumeId();
		return TT.IDENTIFIER;
	}

	/// Tries to consume `identifier` starting at the current char.
	/// Returns true only if all of it matched and the next char cannot continue an identifier.
	///
	/// Does not reset in case of mismatch, so we continue consuming chars as regular identifier.
	/// For the same reason a failed `match` must not be followed by another `match` attempt.
	private bool match(string identifier)
	{
		uint index = 0;
		while (identifier[index] == c)
		{
			nextChar;
			++index;
			if (index == identifier.length)
			{
				// check that no valid symbol follow this id. ifff for if id.
				if (isIdSecond(c)) return false;
				return true;
			}
		}
		return false;
	}

	/// Consumes the remaining chars of an identifier.
	private void consumeId()
	{
		while (isIdSecond(c)) nextChar;
	}

	/// Lexes the rest of a decimal literal after its first digit was consumed.
	/// Handles `_` separators, an optional fraction, an optional exponent and the
	/// `f32`/`f64` and int suffixes. Returns `TT.INT_DEC_LITERAL` or `TT.FLOAT_DEC_LITERAL`.
	private TokenType consumeDecimal()
	{
		// skip initial decimal literal
		while (isNumSecond(c)) nextChar;

		bool isFloat = false;

		if (c == 'f') {
			skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		} else if (c == 'i' || c == 'u') {
			skipIntSuffix;
			return TT.INT_DEC_LITERAL;
		}

		// check for "." followed by decimal
		if (c == '.') {
			nextChar; // skip .
			if (!isNumFirst(c)) {
				// The dot is not part of the number (e.g. `1..2` or a member access)
				prevChar(); // revert .
				return TT.INT_DEC_LITERAL;
			}
			// skip second decimal literal
			while (isNumSecond(c)) nextChar;
			isFloat = true;
		}

		// check for exponent
		if (c == 'e' || c == 'E') {
			nextChar; // skip eE
			if (c == '+' || c == '-') nextChar; // skip -+
			if (!isNumFirst(c)) lexError(TT.INVALID, "Invalid char after exponent of float literal. Expected digit, got '%s'", c);
			// skip exponent
			while (isNumSecond(c)) nextChar;
			if (c == 'f') skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		} else if (c == 'f') {
			skipFloatSuffix;
			return TT.FLOAT_DEC_LITERAL;
		}

		if (isFloat) return TT.FLOAT_DEC_LITERAL;
		return TT.INT_DEC_LITERAL;
	}

	/// Consumes an `f32` or `f64` literal suffix; `c` is the `f`. Anything else is an error.
	private void skipFloatSuffix() {
		// f32/f64
		nextChar; // skip f
		if (c == '3') {
			nextChar; // skip 3
			if (c == '2') {
				nextChar; // skip 2
			} else lexError(TT.INVALID, "Invalid char after `f3` of float literal. Expected f32 or f64 suffix, got '%s'", c);
		} else if (c == '6') {
			nextChar; // skip 6
			if (c == '4') {
				nextChar; // skip 4
			} else lexError(TT.INVALID, "Invalid char after `f6` of float literal. Expected f32 or f64 suffix, got '%s'", c);
		}
		else lexError(TT.INVALID, "Invalid char after `f` of float literal. Expected f32 or f64 suffix, got '%s'", c);
	}

	/// Consumes an int literal suffix; `c` is the leading `i` or `u`. Anything else is an error.
	private void skipIntSuffix() {
		// i8/i16/i32/i64/u8/u16/u32/u64
		dchar intChar = c; // kept for error messages
		nextChar; // skip i/u
		switch(c)
		{
			case '1':
				nextChar; // skip 1
				if (c == '6') {
					nextChar; // skip 6
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s1` of int literal. Expected %1$s16 suffix, got '%1$s1%s'", intChar, c);
			case '3':
				nextChar; // skip 3
				if (c == '2') {
					nextChar; // skip 2
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s3` of int literal. Expected %1$s32 suffix, got '%1$s3%s'", intChar, c);
			case '6':
				nextChar; // skip 6
				if (c == '4') {
					nextChar; // skip 4
					return;
				} else lexError(TT.INVALID, "Invalid char after `%1$s6` of int literal. Expected %1$s64 suffix, got '%1$s6%s'", intChar, c);
			case '8':
				nextChar; // skip 8
				return;
			default: lexError(TT.INVALID, "Invalid char after `%1$s` of int literal. Expected %1$s8/%1$s16/%1$s32/%1$s64 suffix, got '%1$s%s'", intChar, c);
		}
	}

	/// Consumes hexadecimal digits and `_` separators of a numeric literal.
	/// Returns the number of consumed chars (separators are counted too).
	private uint consumeHexadecimal()
	{
		uint count;
		while (true)
		{
			if ('0' <= c && c <= '9') {
			} else if ('a' <= c && c <= 'f') {
			} else if ('A' <= c && c <= 'F') {
			} else if (c != '_') return count;
			nextChar;
			++count;
		}
	}

	/// Consumes hexadecimal digits only and returns how many were consumed.
	/// Unlike `consumeHexadecimal`, `_` is not accepted. Used for escape sequences,
	/// whose digits are later converted with `to!uint(16)`.
	private uint consumeHexDigits()
	{
		uint count;
		while (('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'))
		{
			nextChar;
			++count;
		}
		return count;
	}

	/// Consumes binary digits and `_` separators of a numeric literal.
	private void consumeBinary()
	{
		while (true)
		{
			if (c == '0' || c == '1') {
			} else if (c != '_') return;
			nextChar;
		}
	}

	/// Consumes chars up to and including the next line break; stops in front of EOI.
	private void consumeLine()
	{
		while (true)
		{
			switch(c)
			{
				case EOI_CHAR: return;
				case '\n': lex_EOLN(); return;
				case '\r': lex_EOLR(); return;
				default: break;
			}
			nextChar;
		}
	}
}
707 
// Converts the text of an escape sequence (without the leading backslash) into its char value.
// strRepr is string representation of a single char, without ' around.
// For x, u and U everything after the selector is parsed as a hexadecimal number.
// An unknown selector is a programming error (assert).
dchar escapeToChar(const(char)[] strRepr) {
	import std.conv : to;

	immutable char selector = strRepr[0];

	// Numeric escapes: \xHH, \uHHHH, \UHHHHHHHH
	if (selector == 'x' || selector == 'u' || selector == 'U')
		return strRepr[1..$].to!uint(16);

	// Single-char escapes
	switch (selector) {
		case '\'': return '\'';
		case '"':  return '\"';
		case '?':  return '\?';
		case '\\': return '\\';
		case '0':  return '\0';
		case 'a':  return '\a';
		case 'b':  return '\b';
		case 'f':  return '\f';
		case 'n':  return '\n';
		case 'r':  return '\r';
		case 't':  return '\t';
		case 'v':  return '\v';
		default: assert(false, strRepr);
	}
}
730 
/// Returns the value of a char literal body (the text between the quotes).
/// A body starting with a backslash is decoded by `escapeToChar`;
/// otherwise the body must be exactly one char long.
dchar getCharValue(const(char)[] strRepr) {
	if (strRepr[0] != '\\') {
		assert(strRepr.length == 1);
		return strRepr[0];
	}
	// escape sequence: decode everything after the backslash
	return escapeToChar(strRepr[1..$]);
}
736 
// Replaces escape sequences in `str` with the bytes they denote and returns the result.
// Only handles valid strings (an unknown escape selector is an assert).
// We copy it into buffer, then run through it modifying in-place.
// The copy is followed by a 0 byte that ends the loop, so a 0 byte inside `str`
// itself would end processing early.
// The returned string is a slice of memory owned by `sink`; bytes of the copy
// that are not part of the result are returned to `sink` with `unput`.
string handleEscapedString(ref Arena!ubyte sink, const(char)[] str)
{
	import std.conv : to;
	char* dstStart = cast(char*)sink.nextPtr;
	sink.put(cast(ubyte[])str);
	sink.put(0); // we will look for this 0 to end the loop
	// src reads the copied text, dst writes the decoded text over the same bytes
	char* src = cast(char*)dstStart;
	char* dst = cast(char*)dstStart;

	loop:
	while(*src) // look for \0 we put into buffer
	{
		if (*src == '\\')
		{
			++src; // skip \

			// Read escaped char
			dchar c;
			switch (*src) {
				case '\'': c = '\''; break;
				case '"':  c = '\"'; break;
				case '?':  c = '\?'; break;
				case '\\': c = '\\'; break;
				case '0':  c = '\0'; break;
				case 'a':  c = '\a'; break;
				case 'b':  c = '\b'; break;
				case 'f':  c = '\f'; break;
				case 'n':  c = '\n'; break;
				case 'r':  c = '\r'; break;
				case 't':  c = '\t'; break;
				case 'v':  c = '\v'; break;
				case 'x':
					// exactly 2 hex digits are read; src then moves past 'x' and both digits
					c = (cast(char[])src[1..3]).to!uint(16); src += 3;
					*dst++ = cast(ubyte)c;
					continue loop; // skip rest, as this represents byte value, not unicode char
				// exactly 4 (u) or 8 (U) hex digits are read
				case 'u':  c = (cast(char[])src[1..5]).to!uint(16); src += 4; break;
				case 'U':  c = (cast(char[])src[1..9]).to!uint(16); src += 8; break;
				default: assert(false, "Invalid escape sequence");
			}
			// Skip the selector char. For bigger cases (u, U) we increment additionally above. (x) is not handled here
			++src;

			// Write utf-8 back
			// NOTE(review): a value >= 0x110000 matches no branch below, so nothing is written for it
			if (c < 0x80) {
				*dst++ = cast(ubyte)c;
			} else if (c < 0x800) {
				*dst++ = 0xC0 | cast(ubyte)(c >> 6);
				*dst++ = 0x80 | (c & 0x3f);
			} else if (c < 0x10000) {
				*dst++ = 0xE0 | cast(ubyte)(c >> 12);
				*dst++ = 0x80 | ((c >> 6) & 0x3F);
				*dst++ = 0x80 | (c & 0x3f);
			} else if (c < 0x110000) {
				*dst++ = 0xF0 | cast(ubyte)(c >> 18);
				*dst++ = 0x80 | ((c >> 12) & 0x3F);
				*dst++ = 0x80 | ((c >> 6) & 0x3F);
				*dst++ = 0x80 | (c & 0x3f);
			}
		}
		else // non-escaped char. Just copy
		{
			*dst++ = *src++;
		}
	}
	// Number of bytes in the decoded string
	size_t len = cast(size_t)(dst - dstStart);
	sink.unput(str.length + 1 - len); // unput \0 too
	return cast(string)dstStart[0..len];
}
807 
/// True for chars that may continue an identifier: ASCII letters, decimal digits and `_`.
private bool isIdSecond(dchar chr) pure nothrow {
	switch (chr) {
		case '0': .. case '9':
		case 'a': .. case 'z':
		case 'A': .. case 'Z':
		case '_':
			return true;
		default:
			return false;
	}
}
815 
/// True for chars that may start a decimal number: the digits 0-9.
private bool isNumFirst(dchar chr) pure nothrow {
	if (chr < '0') return false;
	return chr <= '9';
}
819 
/// True for chars that may continue a decimal number: the digits 0-9 and the `_` separator.
private bool isNumSecond(dchar chr) pure nothrow {
	if (chr == '_') return true;
	return chr >= '0' && chr <= '9';
}
823 
824 
// Lexer tests: keywords, operators, numeric literals, comments, string and char literals
unittest
{
	CompilationContext ctx;
	ubyte[64] tokenBuffer;
	ubyte[64] locs;

	// Creates a lexer over `input` followed by EOI_CHAR.
	// The token and location arenas of `ctx` are pointed at the local buffers on every call.
	Lexer makeLexer(string input) {
		ctx.tokenBuffer.setBuffer(tokenBuffer[], 64);
		ctx.tokenLocationBuffer.setBuffer(locs[], 64);
		return Lexer(&ctx, input~EOI_CHAR, &ctx.tokenBuffer, &ctx.tokenLocationBuffer);
	}

	// Text of the token number `index`, as recorded by the last `lex` call
	auto tokenText(size_t index, string source) {
		return ctx.tokenLocationBuffer[index].getTokenString(source);
	}

	// Every keyword lexes to its own token type
	foreach(idx, string kw; keyword_strings)
	{
		TokenType got = makeLexer(kw).nextToken;
		assert(got == keyword_tokens[idx],
			format("For %s expected %s got %s", kw, keyword_tokens[idx], got));
	}

	// A keyword followed by an identifier char is a plain identifier
	foreach(idx, string kw; keyword_strings)
	{
		TokenType got = makeLexer(kw~"A").nextToken;
		assert(got == TT.IDENTIFIER);
	}

	// Operators and punctuation
	{
		string[] ops = ["&","&&","&=","@","\\",":",",",".","..","...",
			"=","==",">",">=",">>",">>=",">>>",">>>=","<","<=","<<","<<=","-",
			"-=","--","!","!=","|","|=","||","%","%=","+","+=","++","?",";","/",
			"/=","*","*=","~","~=","^","^=","(",")","[","]","{","}",];
		TokenType[] tokens_ops = [TT.AND,TT.AND_AND,TT.AND_EQUAL,TT.AT,TT.BACKSLASH,
			TT.COLON,TT.COMMA,TT.DOT,TT.DOT_DOT,TT.DOT_DOT_DOT,TT.EQUAL,
			TT.EQUAL_EQUAL,TT.MORE,TT.MORE_EQUAL,TT.MORE_MORE,
			TT.MORE_MORE_EQUAL,TT.MORE_MORE_MORE,TT.MORE_MORE_MORE_EQUAL,
			TT.LESS,TT.LESS_EQUAL,TT.LESS_LESS,TT.LESS_LESS_EQUAL,TT.MINUS,
			TT.MINUS_EQUAL,TT.MINUS_MINUS,TT.NOT,TT.NOT_EQUAL,TT.OR,TT.OR_EQUAL,
			TT.OR_OR,TT.PERCENT,TT.PERCENT_EQUAL,TT.PLUS,TT.PLUS_EQUAL,TT.PLUS_PLUS,
			TT.QUESTION,TT.SEMICOLON,TT.SLASH,TT.SLASH_EQUAL,TT.STAR,TT.STAR_EQUAL,
			TT.TILDE,TT.TILDE_EQUAL,TT.XOR,TT.XOR_EQUAL,TT.LPAREN,TT.RPAREN,
			TT.LBRACKET,TT.RBRACKET, TT.LCURLY,TT.RCURLY,];
		foreach(idx, string opText; ops)
		{
			TokenType got = makeLexer(opText).nextToken;
			assert(got == tokens_ops[idx],
				format("For %s expected %s got %s", opText, tokens_ops[idx], got));
		}
	}

	// Checks the type of the first token of `input`
	void expectFirst(string input, TokenType expected)
	{
		Lexer numLexer = makeLexer(input);
		assert(numLexer.nextToken == expected);
	}

	// Numeric literals
	assert(makeLexer("_10").nextToken == TT.IDENTIFIER);
	foreach (text; ["10", "1_0", "10_"])
		expectFirst(text, TT.INT_DEC_LITERAL);
	foreach (text; ["10.0", "10.0e0", "10.0e+0", "10.0e-0", "10.0E+0", "10.0E-0",
		"10e0", "10E0", "10e+0", "10e-0", "10E+0", "10E-0"])
		expectFirst(text, TT.FLOAT_DEC_LITERAL);
	foreach (text; ["0xFF", "0XABCDEF0123456789", "0x1_0"])
		expectFirst(text, TT.INT_HEX_LITERAL);
	foreach (text; ["0b10", "0B10", "0b1_0"])
		expectFirst(text, TT.INT_BIN_LITERAL);

	// Multiline comment followed by identifier
	{
		string source = "/*\n*/test";
		makeLexer(source).lex();
		assert(tokenBuffer[0] == TT.COMMENT);
		assert(tokenText(0, source) == "/*\n*/", format("%s", ctx.tokenLocationBuffer[0]));
		assert(tokenBuffer[1] == TT.IDENTIFIER);
		assert(tokenText(1, source) == "test");
	}
	// Line comment (includes its line break) followed by identifier
	{
		string source = "//test\nhello";
		makeLexer(source).lex();
		assert(tokenBuffer[0] == TT.COMMENT);
		assert(tokenText(0, source) == "//test\n");
		assert(tokenBuffer[1] == TT.IDENTIFIER);
		assert(tokenText(1, source) == "hello");
	}
	// String literal
	{
		string source = `"literal"`;
		makeLexer(source).lex();
		assert(tokenBuffer[0] == TT.STRING_LITERAL);
		assert(tokenText(0, source) == `"literal"`, format("%s", tokenBuffer[0]));
	}
	// Char literal
	{
		string source = `'@'`;
		makeLexer(source).lex();
		assert(tokenBuffer[0] == TT.CHAR_LITERAL);
		assert(tokenText(0, source) == `'@'`, format("%s", tokenBuffer[0]));
	}
}