1 /**
2 Copyright: Copyright (c) 2018-2019 Andrey Penechko.
3 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
4 Authors: Andrey Penechko.
5 */
6 
7 module vox.be.emit_mc_amd64;
8 
9 import std.stdio;
10 
11 import vox.all;
12 import vox.be.amd64asm;
13 
14 /// Emits machine code for amd64 architecture
15 void pass_emit_mc_amd64(ref CompilationContext context, CompilePassPerModule[] subPasses)
16 {
17 	auto emitter = CodeEmitter(&context);
18 
19 	// emit code
20 	foreach (ref SourceFileInfo file; context.files.data) {
21 		emitter.compileModule(file.mod);
22 	}
23 
24 	fillStaticDataSections(&context);
25 
26 	if (context.printStaticData) {
27 		writefln("// RW data: addr 0x%X, %s bytes",
28 			context.staticDataBuffer.bufPtr,
29 			context.staticDataBuffer.length);
30 		printHex(context.staticDataBuffer.data, 16);
31 		writefln("// RO data: addr 0x%X, %s bytes",
32 			context.roStaticDataBuffer.bufPtr,
33 			context.roStaticDataBuffer.length);
34 		printHex(context.roStaticDataBuffer.data, 16);
35 	}
36 }
37 
38 // Arranges static data inside static data sections
39 void fillStaticDataSections(CompilationContext* c)
40 {
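	// Two passes: initialized globals are appended to their section buffers first;
	// zero-initialized globals then get offsets past the end of the RW buffer,
	// with no bytes emitted for them (only the section's zeroDataLength grows).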
41 	// copy initialized static data into buffer and set offsets
42 	foreach(size_t i, ref IrGlobal global; c.globals.buffer.data)
43 	{
44 		ObjectSymbol* globalSym = c.objSymTab.getSymbol(global.objectSymIndex);
45 		if (globalSym.isAllZero) continue;
46 
47 		ObjectSection* symSection = c.objSymTab.getSection(globalSym.sectionIndex);
48 
49 		if (symSection.buffer.contains(globalSym.initializer.ptr)) {
			// If the initializer already lives inside the section buffer, assume zero termination was handled when it was placed there
51 			globalSym.sectionOffset = cast(uint)(globalSym.initializer.ptr - symSection.buffer.bufPtr);
52 			continue;
53 		}
54 
55 		// alignment
56 		uint padding = paddingSize!uint(cast(uint)symSection.buffer.length, globalSym.alignment);
57 		symSection.buffer.pad(padding);
58 
59 		// offset
60 		globalSym.sectionOffset = cast(uint)symSection.buffer.length;
61 
62 		// copy data
63 		c.assertf(globalSym.dataPtr !is null, "null initializer");
64 		symSection.buffer.put(globalSym.initializer);
65 
66 		// zero termination
67 		if (globalSym.needsZeroTermination) symSection.buffer.put(0);
68 		//writefln("Global %s, size %s, zero %s, offset %s, buf size %s",
69 		//	globalSym.initializer, globalSym.length, globalSym.needsZeroTermination, globalSym.sectionOffset, symSection.buffer.length);
70 	}
71 
72 	uint zeroDataOffset = cast(uint)c.staticDataBuffer.length;
73 	LinkIndex rwSectionIndex = c.builtinSections[ObjectSectionType.rw_data];
74 
75 	// second pass for zero initialized data
76 	foreach(size_t i, ref IrGlobal global; c.globals.buffer.data)
77 	{
78 		ObjectSymbol* globalSym = c.objSymTab.getSymbol(global.objectSymIndex);
79 		if (!globalSym.isAllZero) continue;
80 
81 		c.assertf(globalSym.sectionIndex == rwSectionIndex, "Cannot have zero-initialized data in sections other than RW");
82 
83 		// alignment
84 		uint padding = paddingSize!uint(zeroDataOffset, globalSym.alignment);
85 		zeroDataOffset += padding;
86 
87 		// offset
88 		globalSym.sectionOffset = zeroDataOffset;
89 
		// account for the size; no bytes are emitted for zero-initialized data
91 		zeroDataOffset += globalSym.length;
92 
93 		// zero termination
94 		if (globalSym.needsZeroTermination) ++zeroDataOffset;
95 	}
96 
97 	ObjectSection* rwSection = c.objSymTab.getSection(c.builtinSections[ObjectSectionType.rw_data]);
98 	c.zeroDataLength = zeroDataOffset - cast(uint)c.staticDataBuffer.length;
99 	rwSection.zeroDataLength = c.zeroDataLength;
100 }
101 
102 //version = emit_mc_print;
103 
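/// Per-module machine code emitter. Holds the state of the function currently being
/// compiled: its LIR, the start address of each block, and pending jump fixups.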
104 struct CodeEmitter
105 {
106 	CompilationContext* context;
107 
108 	FunctionDeclNode* fun;
109 	IrFunction* lir;
110 	CodeGen_x86_64 gen;
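	// Start address of each emitted basic block, indexed by block seqIndex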
111 	PC[] blockStarts;
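	// Per-block pair of jump fixup addresses (one per successor); null means no fixup is needed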
112 	PC[2][] jumpFixups;
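	// Bytes pushed onto the stack since the start of the current block (e.g. by push before a call);
	// used to correct RSP-relative stack slot addresses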
113 	int stackPointerExtraOffset;
114 	IrIndex stackPointer;
115 
116 	void compileModule(ModuleDeclNode* mod)
117 	{
118 		ubyte* codeStart = context.codeBuffer.nextPtr;
119 		gen.encoder.setBuffer(&context.codeBuffer);
120 
121 		foreach(funcIndex; mod.functions) {
122 			FunctionDeclNode* f = context.getAst!FunctionDeclNode(funcIndex);
123 
124 			if (f.isExternal) continue;
125 			compileFunction(f);
126 		}
127 
128 		ubyte[] code = codeStart[0..context.codeBuffer.nextPtr-codeStart];
129 
130 		if (context.printCodeHex && context.printDumpOfAll) {
131 			writefln("// Amd64 code: addr 0x%X, %s bytes", code.ptr, code.length);
132 			printHex(code, 16);
133 			writeln;
134 		}
135 	}
136 
137 	void compileFunction(FunctionDeclNode* f)
138 	{
139 		context.currentFunction = f;
140 		scope(exit) context.currentFunction = null;
141 
142 		fun = f;
143 		lir = context.getAst!IrFunction(fun.backendData.lirData);
144 
145 		ObjectSymbol* funcSym = context.objSymTab.getSymbol(fun.backendData.objectSymIndex);
146 		funcSym.dataPtr = gen.pc;
147 		funcSym.sectionOffset = cast(ulong)(gen.pc - context.codeBuffer.bufPtr);
148 
149 		if (context.buildType == BuildType.exe && fun.id == CommonIds.id_main)
150 		{
151 			if (context.entryPoint !is null)
152 			{
153 				context.unrecoverable_error(fun.loc, "Multiple entry points: %s, %s", fun.loc, context.entryPoint.loc);
154 			}
155 
156 			context.entryPoint = fun;
157 
158 			if (context.targetOs == TargetOs.linux) {
				// On Linux the stack is 16-byte aligned at the entry point, but the rest of the code assumes 16-byte alignment plus 8 bytes pushed by a call instruction
160 				// section 3.4.1 of AMD64 ABI 1.0 says:
161 				//  `rsp`: The stack pointer holds the address of the byte with lowest address which is part of
162 				//         the stack. It is guaranteed to be 16-byte aligned at process entry
163 				// https://stackoverflow.com/questions/26866723/main-and-stack-alignment
164 				gen.subq(Register.SP, Imm8(8));
165 			}
166 		}
167 
168 		stackPointer = IrIndex(lir.getCallConv(context).stackPointer, ArgType.QWORD);
169 
170 		blockStarts = cast(PC[])context.tempBuffer.voidPut(lir.numBasicBlocks * (PC.sizeof / uint.sizeof));
171 
172 		uint[] buf = context.tempBuffer.voidPut(lir.numBasicBlocks * 2 * (PC.sizeof / uint.sizeof)); // TODO: free mem
173 		// buf[] = 0; //zeroing is not needed, because both slots are correctly filled by jump instruction emitters
174 		jumpFixups = cast(PC[2][])buf;
175 
176 		compileFuncProlog();
177 		compileBody();
178 		fixJumps();
179 
180 		funcSym.length = cast(uint)(gen.pc - funcSym.dataPtr);
181 
182 		if (context.printCodeHex && context.printDumpOnlyOf(f)) {
183 			writefln("// Amd64 code: %s addr 0x%X, %s bytes", context.idString(f.id), funcSym.dataPtr, funcSym.length);
184 			printHex(funcSym.dataPtr[0..funcSym.length], 16);
185 		}
186 	}
187 
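	// Prologue: optionally push RBP and point it at RSP, then reserve the stack frame
	// (minus the 8 bytes already occupied by the pushed RBP)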
188 	void compileFuncProlog()
189 	{
190 		uint reservedBytes = lir.stackFrameSize;
191 
192 		// frame pointer is stored with a push, so don't allocate space for it
193 		if (context.useFramePointer) {
194 			context.assertf(reservedBytes >= STACK_ITEM_SIZE, "bug");
195 			reservedBytes -= STACK_ITEM_SIZE;
196 		}
197 
198 		// Establish frame pointer
199 		if (context.useFramePointer)
200 		{
201 			gen.pushq(Register.BP);
202 			gen.movq(Register.BP, Register.SP);
203 		}
204 
205 		if (reservedBytes) // Reserve space for locals
206 		{
207 			if (reservedBytes > byte.max) gen.subq(Register.SP, Imm32(reservedBytes));
208 			else gen.subq(Register.SP, Imm8(cast(byte)reservedBytes));
209 		}
210 	}
211 
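	// Epilogue: mirrors the prologue; free the stack frame, restore RBP if it was used, then ret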
212 	void compileFuncEpilog()
213 	{
214 		uint reservedBytes = lir.stackFrameSize;
215 
		// frame pointer was stored with a push, so its slot is freed by popq, not here
217 		if (context.useFramePointer) {
218 			context.assertf(reservedBytes >= STACK_ITEM_SIZE, "bug");
219 			reservedBytes -= STACK_ITEM_SIZE;
220 		}
221 
222 		if (reservedBytes)
223 		{
224 			if (reservedBytes > byte.max) gen.addq(Register.SP, Imm32(reservedBytes));
225 			else gen.addq(Register.SP, Imm8(cast(byte)reservedBytes));
226 		}
227 
228 		if (context.useFramePointer)
229 		{
230 			// Restore frame pointer
231 			gen.popq(Register.BP);
232 		}
233 
234 		gen.ret();
235 	}
236 
237 	uint referenceOffset()
238 	{
239 		ObjectSymbol* funcSym = context.objSymTab.getSymbol(fun.backendData.objectSymIndex);
240 		ptrdiff_t diff = gen.pc - funcSym.dataPtr;
241 		context.assertf(diff >= 0, "Negative buffer position");
242 		context.assertf(diff <= uint.max, "Function is bigger than uint.max");
243 		return cast(uint)diff;
244 	}
245 
246 	// successorBIndex is 0 or 1
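	// Emits a jump to successor successorBIndex, unless that successor immediately follows
	// fromBlock in layout order, in which case the jump is elided (fall-through).
	// successorA carries a fixup already emitted for the other successor
	// (non-null only for conditional branches).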
247 	void genJumpToSuccessors(ref IrBasicBlock fromBlock, ubyte successorBIndex, PC successorA = null)
248 	{
249 		if (fromBlock.seqIndex + 1 != lir.getBlock(fromBlock.successors[successorBIndex, lir]).seqIndex) {
250 			gen.jmp(Imm32(0));
251 			jumpFixups[fromBlock.seqIndex][successorBIndex] = gen.pc;
252 		} else {
253 			// zero out the successor fixup
254 			jumpFixups[fromBlock.seqIndex][successorBIndex] = null;
255 		}
		// record the fixup for the other successor (null, or the jcc fixup passed in by the caller)
257 		jumpFixups[fromBlock.seqIndex][1 - successorBIndex] = successorA;
258 	}
259 
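	// Lowers every LIR instruction to machine code, walking the blocks in sequential layout
	// order and recording block starts and jump fixups for the later fixJumps() pass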
260 	void compileBody()
261 	{
262 		lir.assignSequentialBlockIndices();
263 
264 		foreach (IrIndex lirBlockIndex, ref IrBasicBlock lirBlock; lir.blocks)
265 		{
266 			blockStarts[lirBlock.seqIndex] = gen.pc;
267 			stackPointerExtraOffset = 0;
268 			foreach(IrIndex instrIndex, ref IrInstrHeader instrHeader; lirBlock.instructions(lir))
269 			{
270 				switch(cast(Amd64Opcode)instrHeader.op)
271 				{
272 					case Amd64Opcode.mov:
273 						genMove(instrHeader.result(lir), instrHeader.arg(lir, 0));
274 						break;
275 					case Amd64Opcode.xchg:
276 						IrIndex arg0 = instrHeader.arg(lir, 0);
277 						IrIndex arg1 = instrHeader.arg(lir, 1);
278 						context.assertf(arg1.isPhysReg, "%s is not phys reg", arg1);
279 						context.assertf(arg0.isPhysReg, "%s is not phys reg", arg0);
280 						context.assertf(arg0.physRegSize == arg1.physRegSize,
281 							"%s:%s reg size mismatch %s != %s", lirBlockIndex, instrIndex, arg0.physRegSize, arg1.physRegSize);
282 						context.assertf(arg0.physRegClass == arg1.physRegClass && arg0.physRegClass == AMD64_REG_CLASS.GPR, "Only GPR xchg is implemented");
283 						Register dst = indexToRegister(arg0);
284 						Register src = indexToRegister(arg1);
285 						gen.xchg(dst, src, cast(ArgType)arg0.physRegSize);
286 						break;
287 					case Amd64Opcode.load:
288 						genLoad(instrHeader.result(lir), instrHeader.arg(lir, 0));
289 						break;
290 					case Amd64Opcode.store:
291 						genStore(instrHeader.arg(lir, 0), instrHeader.arg(lir, 1), instrHeader.argSize);
292 						break;
293 					case Amd64Opcode.add:
294 						IrIndex arg0 = instrHeader.arg(lir, 0);
295 						IrIndex arg1 = instrHeader.arg(lir, 1);
296 						if (arg1.isStackSlot)
297 						{
							// This was generated from a GEP:
							//   reg += rsp + disp8/32
							// Convert it into:
							//   lea reg, [rsp + reg + disp8/32]
302 							Register dst = indexToRegister(arg0);
303 							MemAddress addr = localVarMemAddress(arg1);
304 							switch(addr.type) {
305 								case MemAddrType.baseDisp8:
306 									MemAddress newAddr = memAddrBaseIndexDisp8(addr.baseReg, dst, SibScale(0), addr.disp8.value);
307 									gen.lea(dst, newAddr, ArgType.QWORD);
308 									break;
309 								case MemAddrType.baseDisp32:
310 									MemAddress newAddr = memAddrBaseIndexDisp32(addr.baseReg, dst, SibScale(0), addr.disp32.value);
311 									gen.lea(dst, newAddr, ArgType.QWORD);
312 									break;
313 								default:
314 									context.internal_error("Invalid memory operand %s", addr);
315 							}
316 						}
317 						else
318 						{
319 							genRegular(arg0, arg1, AMD64OpRegular.add, cast(IrArgSize)arg0.physRegSize, instrIndex);
320 						}
321 
322 						if (arg0 == stackPointer) {
323 							if (arg1.isSimpleConstant) {
324 								stackPointerExtraOffset -= context.constants.get(arg1).i64;
325 							} else {
326 								dumpFunction(context, lir, "Code gen");
								context.internal_error("Cannot increment stack pointer by a non-constant in %s", instrIndex);
328 							}
329 						}
330 						break;
331 					case Amd64Opcode.sub:
332 						IrIndex arg0 = instrHeader.arg(lir, 0);
333 						IrIndex arg1 = instrHeader.arg(lir, 1);
334 						genRegular(arg0, arg1, AMD64OpRegular.sub, cast(IrArgSize)arg0.physRegSize, instrIndex);
335 						if (arg0 == stackPointer) {
336 							if (arg1.isSimpleConstant) {
337 								stackPointerExtraOffset += context.constants.get(arg1).i64;
338 							} else {
339 								dumpFunction(context, lir, "Code gen");
								context.internal_error("Cannot decrement stack pointer by a non-constant in %s", instrIndex);
341 							}
342 						}
343 						break;
344 					case Amd64Opcode.xor:
345 						genRegular(instrHeader.arg(lir, 0), instrHeader.arg(lir, 1), AMD64OpRegular.xor, cast(IrArgSize)instrHeader.arg(lir, 0).physRegSize, instrIndex);
346 						break;
347 					case Amd64Opcode.or:
348 						genRegular(instrHeader.arg(lir, 0), instrHeader.arg(lir, 1), AMD64OpRegular.or, cast(IrArgSize)instrHeader.arg(lir, 0).physRegSize, instrIndex);
349 						break;
350 					case Amd64Opcode.and:
351 						genRegular(instrHeader.arg(lir, 0), instrHeader.arg(lir, 1), AMD64OpRegular.and, cast(IrArgSize)instrHeader.arg(lir, 0).physRegSize, instrIndex);
352 						break;
353 					case Amd64Opcode.imul:
354 						context.assertf(instrHeader.arg(lir, 0).isPhysReg, "%s is not phys reg", instrHeader.arg(lir, 0));
355 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
356 						switch(instrHeader.arg(lir, 1).kind) with(IrValueKind) {
357 							case constant:
358 								IrConstant con = context.constants.get(instrHeader.arg(lir, 1));
359 								gen.imulq(dst, dst, Imm32(con.i32));
360 								break;
361 							case physicalRegister:
362 								Register src = indexToRegister(instrHeader.arg(lir, 1));
363 								gen.imul(dst, src, cast(ArgType)instrHeader.arg(lir, 0).physRegSize);
364 								break;
365 							default:
366 								context.internal_error("imul %s not implemented", instrHeader.args(lir));
367 						}
368 						break;
369 					case Amd64Opcode.div:
370 						Register divisor = indexToRegister(instrHeader.arg(lir, 2));
371 						gen.div(divisor, cast(ArgType)instrHeader.arg(lir, 2).physRegSize);
372 						break;
373 					case Amd64Opcode.idiv:
374 						Register divisor = indexToRegister(instrHeader.arg(lir, 2));
375 						gen.idiv(divisor, cast(ArgType)instrHeader.arg(lir, 2).physRegSize);
376 						break;
377 
378 					case Amd64Opcode.fadd:
379 						IrIndex arg0 = instrHeader.arg(lir, 0);
380 						IrIndex arg1 = instrHeader.arg(lir, 1);
381 						context.assertf(arg0.physRegClass == AMD64_REG_CLASS.XMM, "fadd reg class %s != XMM", arg0.physRegClass);
382 						context.assertf(arg1.physRegClass == AMD64_REG_CLASS.XMM, "fadd reg class %s != XMM", arg1.physRegClass);
383 						final switch(instrHeader.argSize) with(IrArgSize) {
384 							case size32: gen.addss(indexToRegister(arg0), indexToRegister(arg1)); break;
385 							case size64: gen.addsd(indexToRegister(arg0), indexToRegister(arg1)); break;
386 							case size8, size16, size128, size256, size512: context.internal_error("fadd %s", instrHeader.argSize);
387 						}
388 						break;
389 					case Amd64Opcode.fsub:
390 						IrIndex arg0 = instrHeader.arg(lir, 0);
391 						IrIndex arg1 = instrHeader.arg(lir, 1);
392 						context.assertf(arg0.physRegClass == AMD64_REG_CLASS.XMM, "fsub reg class %s != XMM", arg0.physRegClass);
393 						context.assertf(arg1.physRegClass == AMD64_REG_CLASS.XMM, "fsub reg class %s != XMM", arg1.physRegClass);
394 						final switch(instrHeader.argSize) with(IrArgSize) {
395 							case size32: gen.subss(indexToRegister(arg0), indexToRegister(arg1)); break;
396 							case size64: gen.subsd(indexToRegister(arg0), indexToRegister(arg1)); break;
397 							case size8, size16, size128, size256, size512: context.internal_error("fsub %s", instrHeader.argSize);
398 						}
399 						break;
400 					case Amd64Opcode.fdiv:
401 						IrIndex arg0 = instrHeader.arg(lir, 0);
402 						IrIndex arg1 = instrHeader.arg(lir, 1);
403 						context.assertf(arg0.physRegClass == AMD64_REG_CLASS.XMM, "fdiv reg class %s != XMM", arg0.physRegClass);
404 						context.assertf(arg1.physRegClass == AMD64_REG_CLASS.XMM, "fdiv reg class %s != XMM", arg1.physRegClass);
405 						final switch(instrHeader.argSize) with(IrArgSize) {
406 							case size32: gen.divss(indexToRegister(arg0), indexToRegister(arg1)); break;
407 							case size64: gen.divsd(indexToRegister(arg0), indexToRegister(arg1)); break;
408 							case size8, size16, size128, size256, size512: context.internal_error("fdiv %s", instrHeader.argSize);
409 						}
410 						break;
411 					case Amd64Opcode.fmul:
412 						IrIndex arg0 = instrHeader.arg(lir, 0);
413 						IrIndex arg1 = instrHeader.arg(lir, 1);
414 						context.assertf(arg0.physRegClass == AMD64_REG_CLASS.XMM, "fmul reg class %s != XMM", arg0.physRegClass);
415 						context.assertf(arg1.physRegClass == AMD64_REG_CLASS.XMM, "fmul reg class %s != XMM", arg1.physRegClass);
416 						final switch(instrHeader.argSize) with(IrArgSize) {
417 							case size32: gen.mulss(indexToRegister(arg0), indexToRegister(arg1)); break;
418 							case size64: gen.mulsd(indexToRegister(arg0), indexToRegister(arg1)); break;
419 							case size8, size16, size128, size256, size512: context.internal_error("fmul %s", instrHeader.argSize);
420 						}
421 						break;
422 
423 					case Amd64Opcode.movzx_btow: gen.movzx_btow(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
424 					case Amd64Opcode.movzx_btod: gen.movzx_btod(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
425 					case Amd64Opcode.movzx_btoq: gen.movzx_btoq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
426 					case Amd64Opcode.movzx_wtod: gen.movzx_wtod(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
427 					case Amd64Opcode.movzx_wtoq: gen.movzx_wtoq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
428 					case Amd64Opcode.movsx_btow: gen.movsx_btow(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
429 					case Amd64Opcode.movsx_btod: gen.movsx_btod(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
430 					case Amd64Opcode.movsx_btoq: gen.movsx_btoq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
431 					case Amd64Opcode.movsx_wtod: gen.movsx_wtod(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
432 					case Amd64Opcode.movsx_wtoq: gen.movsx_wtoq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
433 					case Amd64Opcode.movsx_dtoq: gen.movsx_dtoq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
434 					case Amd64Opcode.f32_to_f64: gen.cvtss2sd(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
435 					case Amd64Opcode.f64_to_f32: gen.cvtsd2ss(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
436 
437 					case Amd64Opcode.i32_to_f32: gen.cvtsid2ss(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
438 					case Amd64Opcode.i64_to_f32: gen.cvtsiq2ss(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
439 					case Amd64Opcode.i32_to_f64: gen.cvtsid2sd(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
440 					case Amd64Opcode.i64_to_f64: gen.cvtsiq2sd(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
441 
442 					case Amd64Opcode.f32_to_i32_trunc: gen.cvttss2sid(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
443 					case Amd64Opcode.f32_to_i64_trunc: gen.cvttss2siq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
444 					case Amd64Opcode.f64_to_i32_trunc: gen.cvttsd2sid(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
445 					case Amd64Opcode.f64_to_i64_trunc: gen.cvttsd2siq(indexToRegister(instrHeader.result(lir)), indexToRegister(instrHeader.arg(lir, 0))); break;
446 
447 					case Amd64Opcode.rep_stos: gen.rep_prefix; gen.stos; break;
448 					case Amd64Opcode.divsx:
449 						final switch(instrHeader.argSize) {
450 							case IrArgSize.size8: gen.movsx_btow(Register.AX, Register.AX); break;
451 							case IrArgSize.size16: gen.cwd; break;
452 							case IrArgSize.size32: gen.cdq; break;
453 							case IrArgSize.size64: gen.cqo; break;
454 							case IrArgSize.size128, IrArgSize.size256, IrArgSize.size512: context.internal_error("divsx %s", instrHeader.argSize);
455 						}
456 						break;
457 					case Amd64Opcode.shl:
458 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
459 						IrIndex src = instrHeader.arg(lir, 1);
460 						if (src.isSimpleConstant) {
461 							IrConstant con = context.constants.get(instrHeader.arg(lir, 1));
462 							if (con.i8 == 1)
463 								gen.shl1(dst, cast(ArgType)instrHeader.argSize);
464 							else
465 								gen.shli(dst, Imm8(con.i8), cast(ArgType)instrHeader.argSize);
466 						}
467 						else
468 							gen.shl(dst, cast(ArgType)instrHeader.argSize);
469 						break;
470 					case Amd64Opcode.shr:
471 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
472 						IrIndex src = instrHeader.arg(lir, 1);
473 						if (src.isSimpleConstant) {
474 							IrConstant con = context.constants.get(instrHeader.arg(lir, 1));
475 							if (con.i8 == 1)
476 								gen.shr1(dst, cast(ArgType)instrHeader.argSize);
477 							else
478 								gen.shri(dst, Imm8(con.i8), cast(ArgType)instrHeader.argSize);
479 						}
480 						else
481 							gen.shr(dst, cast(ArgType)instrHeader.argSize);
482 						break;
483 					case Amd64Opcode.sar:
484 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
485 						IrIndex src = instrHeader.arg(lir, 1);
486 						if (src.isSimpleConstant) {
487 							IrConstant con = context.constants.get(instrHeader.arg(lir, 1));
488 							if (con.i8 == 1)
489 								gen.sar1(dst, cast(ArgType)instrHeader.argSize);
490 							else
491 								gen.sari(dst, Imm8(con.i8), cast(ArgType)instrHeader.argSize);
492 						}
493 						else
494 							gen.sar(dst, cast(ArgType)instrHeader.argSize);
495 						break;
496 					case Amd64Opcode.not:
497 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
498 						gen.not(dst, cast(ArgType)instrHeader.arg(lir, 0).physRegSize);
499 						break;
500 					case Amd64Opcode.neg:
501 						Register dst = indexToRegister(instrHeader.arg(lir, 0));
502 						gen.neg(dst, cast(ArgType)instrHeader.arg(lir, 0).physRegSize);
503 						break;
504 					case Amd64Opcode.fneg:
505 						IrIndex arg0 = instrHeader.arg(lir, 0);
506 						context.assertf(arg0.physRegClass == AMD64_REG_CLASS.XMM, "incorrect class %s, xmm expected", arg0.physRegClass);
507 						switch(arg0.physRegSize) {
508 							case IrArgSize.size32:
509 								MemAddress addr = memAddrRipDisp32(0);
510 								gen.xorps(indexToRegister(arg0), addr);
511 								IrIndex sign_bit_global = context.globals.get_or_add_f32_sign_bit_constant(context);
512 								addRefTo(sign_bit_global);
513 								break;
514 							case IrArgSize.size64:
515 								MemAddress addr = memAddrRipDisp32(0);
516 								gen.xorpd(indexToRegister(arg0), addr);
517 								IrIndex sign_bit_global = context.globals.get_or_add_f64_sign_bit_constant(context);
518 								addRefTo(sign_bit_global);
519 								break;
520 							default: context.internal_error("fneg %s", arg0.physRegSize);
521 						}
522 						break;
523 					case Amd64Opcode.call:
524 						IrIndex calleeIndex = instrHeader.arg(lir, 0);
525 
526 						if (calleeIndex.isFunction)
527 						{
528 							// direct call by name
529 							FunctionDeclNode* callee = context.getFunction(calleeIndex);
530 							ObjectSymbol* sym = context.objSymTab.getSymbol(callee.backendData.objectSymIndex);
531 
532 							if (sym.isIndirect)
533 								gen.call(memAddrRipDisp32(0)); // read address from import section
534 							else
535 								gen.call(Imm32(0)); // call relative to next instruction
536 
537 							addRefTo(calleeIndex);
538 						}
539 						else
540 						{
541 							// call by ptr
542 							if (calleeIndex.isStackSlot)
543 							{
544 								MemAddress addr = localVarMemAddress(calleeIndex);
545 								gen.call(addr);
546 							}
547 							else
548 							{
549 								Register calleePtr = indexToRegister(calleeIndex);
550 								gen.call(calleePtr);
551 							}
552 						}
553 						break;
554 					case Amd64Opcode.syscall:
555 						gen.syscall();
556 						break;
557 					case Amd64Opcode.jmp:
558 						genJumpToSuccessors(lirBlock, 0);
559 						break;
560 					case Amd64Opcode.bin_branch:
561 						IrIndex arg0 = instrHeader.arg(lir, 0);
562 						IrIndex arg1 = instrHeader.arg(lir, 1);
563 						auto cond = cast(IrBinaryCondition)instrHeader.cond;
564 
565 						if (arg0.isSimpleConstant)
566 						{
567 							if (arg1.isSimpleConstant)
568 							{
569 								if (evalBinCondition(*context, cond, arg0, arg1))
570 									genJumpToSuccessors(lirBlock, 0);
571 								else
572 									genJumpToSuccessors(lirBlock, 1);
573 								break;
574 							}
575 
576 							// move const to the right
577 							// TODO: perform canonicalization in middle-end
578 							swap(arg0, arg1);
579 							cond = invertBinaryCond(cond);
580 						}
581 
582 						if (arg0.physRegClass == AMD64_REG_CLASS.XMM) {
583 							assert(arg1.physRegClass == AMD64_REG_CLASS.XMM);
584 							final switch(instrHeader.argSize) with(IrArgSize) {
585 								case size32: gen.ucomiss(indexToRegister(arg0), indexToRegister(arg1)); break;
586 								case size64: gen.ucomisd(indexToRegister(arg0), indexToRegister(arg1)); break;
587 								case size8, size16, size128, size256, size512: context.internal_error("bin_branch %s", instrHeader.argSize);
588 							}
589 						} else {
590 							genRegular(arg0, arg1, AMD64OpRegular.cmp, cast(IrArgSize)instrHeader.argSize, instrIndex);
591 						}
592 
593 						Condition mach_cond = IrBinCondToAmd64Condition[cond];
594 						gen.jcc(mach_cond, Imm32(0));
595 						genJumpToSuccessors(lirBlock, 1, gen.pc);
596 						break;
597 					case Amd64Opcode.un_branch:
598 						if (instrHeader.arg(lir, 0).isSimpleConstant)
599 						{
600 							IrConstant con = context.constants.get(instrHeader.arg(lir, 0));
601 							if (con.i64 && instrHeader.cond == IrUnaryCondition.not_zero ||
602 								(!con.i64) && instrHeader.cond == IrUnaryCondition.zero)
603 								genJumpToSuccessors(lirBlock, 0);
604 							else
605 								genJumpToSuccessors(lirBlock, 1);
606 							break;
607 						}
608 						Register reg = indexToRegister(instrHeader.arg(lir, 0));
609 						gen.test(reg, reg, cast(ArgType)instrHeader.arg(lir, 0).physRegSize);
610 						Condition cond = IrUnCondToAmd64Condition[instrHeader.cond];
611 						gen.jcc(cond, Imm32(0));
612 						genJumpToSuccessors(lirBlock, 1, gen.pc);
613 						break;
614 					case Amd64Opcode.set_unary_cond:
615 						Register reg = indexToRegister(instrHeader.arg(lir, 0));
616 						gen.test(reg, reg, cast(ArgType)instrHeader.arg(lir, 0).physRegSize);
617 						Condition cond = IrUnCondToAmd64Condition[instrHeader.cond];
618 						Register dst = indexToRegister(instrHeader.result(lir));
619 						gen.setcc(cond, dst);
620 						break;
621 					case Amd64Opcode.set_binary_cond:
622 						IrIndex arg0 = instrHeader.arg(lir, 0);
623 						IrIndex arg1 = instrHeader.arg(lir, 1);
624 						Condition cond = IrBinCondToAmd64Condition[instrHeader.cond];
625 						if (arg0.physRegClass == AMD64_REG_CLASS.XMM) {
626 							assert(arg1.physRegClass == AMD64_REG_CLASS.XMM);
627 							switch(arg0.physRegSize) {
628 								case IrArgSize.size32: gen.ucomiss(indexToRegister(arg0), indexToRegister(arg1)); break;
629 								case IrArgSize.size64: gen.ucomisd(indexToRegister(arg0), indexToRegister(arg1)); break;
630 								default: context.internal_error("set_binary_cond %s", arg0.physRegSize);
631 							}
632 						} else {
633 							genRegular(arg0, arg1, AMD64OpRegular.cmp, cast(IrArgSize)arg0.physRegSize, instrIndex);
634 						}
635 						Register dst = indexToRegister(instrHeader.result(lir));
636 						gen.setcc(cond, dst);
637 						break;
638 					case Amd64Opcode.ret:
639 						jumpFixups[lirBlock.seqIndex][0] = null;
640 						jumpFixups[lirBlock.seqIndex][1] = null;
641 						compileFuncEpilog();
642 						break;
643 					case Amd64Opcode.ud2:
644 						jumpFixups[lirBlock.seqIndex][0] = null;
645 						jumpFixups[lirBlock.seqIndex][1] = null;
646 						gen.ud2;
647 						break;
648 					case Amd64Opcode.push:
649 						IrIndex src = instrHeader.arg(lir, 0);
650 						switch (src.kind) with(IrValueKind)
651 						{
652 							case constant, constantZero:
653 								IrConstant con = context.constants.get(src);
654 								gen.pushd(Imm32(con.i32));
655 								break;
656 
657 							case physicalRegister:
658 								Register reg = indexToRegister(src);
659 								gen.pushq(reg);
660 								break;
661 
							// these would push the memory contents, not the address itself
663 							/*case stackSlot:
664 								MemAddress addr = localVarMemAddress(src);
665 								gen.pushq(addr);
666 								break;
667 
668 							case global, func:
669 								MemAddress addr = memAddrRipDisp32(0);
670 								gen.pushq(addr);
671 								addRefTo(src);
672 								break;*/
673 
674 							default:
675 								context.internal_error("Cannot encode %s %s in %s %s",
676 									cast(Amd64Opcode)instrHeader.op, src, context.idString(lir.name), instrIndex);
677 						}
678 						stackPointerExtraOffset += STACK_ITEM_SIZE;
679 						break;
680 					default:
681 						context.internal_error("Unimplemented instruction `%s`", cast(Amd64Opcode)instrHeader.op);
682 				}
683 			}
684 
685 			if (stackPointerExtraOffset != 0) {
				// When a noreturn function is called, stack cleanup is omitted;
				// such calls are expected to be followed by unreachable (ud2)
688 				if (lir.getInstr(lirBlock.lastInstr).op != Amd64Opcode.ud2)
689 					context.internal_error("Unmatched stack size modification");
690 			}
691 		}
692 	}
693 
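	// Records a relocation from the current function's symbol to entity.
	// offset is the distance from the current position back to the 32-bit field to patch:
	// 4 for a trailing rel32/disp32, 8 when an imm32 follows the disp32 (see genStore)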
694 	void addRefTo(IrIndex entity, short offset = 4)
695 	{
696 		LinkIndex entityIndex;
697 		switch (entity.kind) with(IrValueKind)
698 		{
699 			case global:
700 				IrGlobal* global = context.globals.get(entity);
701 				entityIndex = global.objectSymIndex;
702 				break;
703 
704 			case func:
705 				FunctionDeclNode* func = context.getFunction(entity);
706 				entityIndex = func.backendData.objectSymIndex;
707 				break;
708 
709 			default:
710 				context.internal_error("addRefTo %s %s", entity, offset);
711 		}
712 
713 		addRefTo(entityIndex, offset);
714 	}
715 
716 	void addRefTo(LinkIndex entityIndex, short offset = 4)
717 	{
718 		ObjectSymbolReference r = {
719 			fromSymbol : fun.backendData.objectSymIndex,
720 			referencedSymbol : entityIndex,
721 			refOffset : referenceOffset() - offset,
722 			offset,
723 			ObjectSymbolRefKind.relative32,
724 		};
725 		context.objSymTab.addReference(r);
726 	}
727 
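	// Patches the rel32 that ends at fixup so the jump lands at the start of targetBlock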
728 	void fixJump(PC fixup, lazy IrIndex targetBlock)
729 	{
730 		PC succPC = blockStarts[lir.getBlock(targetBlock).seqIndex];
731 		fix_PC_REL_32(fixup, succPC);
732 	}
733 
734 	void fixJumps()
735 	{
736 		foreach (IrIndex lirBlockIndex, ref IrBasicBlock lirBlock; lir.blocks)
737 		{
738 			PC[2] fixups = jumpFixups[lirBlock.seqIndex];
739 			if (fixups[0] !is null) fixJump(fixups[0], lirBlock.successors[0, lir]);
740 			if (fixups[1] !is null) fixJump(fixups[1], lirBlock.successors[1, lir]);
741 		}
742 	}
743 
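	// Builds a [baseReg + disp] operand for a stack slot, compensating for any pushes made
	// since the start of the block (stackPointerExtraOffset)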
744 	MemAddress localVarMemAddress(IrIndex stackSlotIndex) {
745 		context.assertf(stackSlotIndex.isStackSlot, "Index is not stack slot, but %s", stackSlotIndex.kind);
746 		auto stackSlot = lir.getStackSlot(stackSlotIndex);
747 		Register baseReg = indexToRegister(stackSlot.baseReg);
748 		return minMemAddrBaseDisp(baseReg, stackSlot.displacement + stackPointerExtraOffset);
749 	}
750 
751 	Register indexToRegister(IrIndex regIndex) {
752 		context.assertf(regIndex.isPhysReg, "Index is not register, but %s %s", regIndex.kind, regIndex);
753 		return cast(Register)regIndex.physRegIndex;
754 	}
755 
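	// Emits a reg,reg or reg,imm form of a regular ALU op (add/sub/and/or/xor/cmp).
	// dst must be a physical register; memory operands must be legalized into loads/stores earlier.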
756 	void genRegular(IrIndex dst, IrIndex src, AMD64OpRegular op, IrArgSize argSize, IrIndex instrIndex)
757 	{
758 		AsmArg argDst;
759 		AsmArg argSrc;
760 		AsmOpParam param;
761 		param.op = op;
762 		param.argType = cast(ArgType)argSize;
763 
764 		argDst.reg = indexToRegister(dst);
765 
766 		// HACK, TODO: ESP is generated instead of RSP. Need to store types in instructions / more instruction types
767 		if (argDst.reg == Register.SP) param.argType = ArgType.QWORD;
768 
769 		param.dstKind = AsmArgKind.REG;
770 
771 		//writefln("%s.%s %s %s", op, param.argType, dst.kind, src.kind);
772 
773 		final switch (src.kind) with(IrValueKind)
774 		{
			case none, array, instruction, basicBlock, phi, type, virtualRegister, variable, func, constantAggregate: context.internal_error("genRegular: invalid src %s", src.kind);
776 			case constantZero:
777 			case constant:
778 				IrConstant con = context.constants.get(src);
779 				if (con.i64.argSizeIntSigned == IrArgSize.size8) {
780 					param.immType = ArgType.BYTE;
781 					argSrc.imm8 = Imm8(con.i8);
782 				}
783 				else if (argSize == IrArgSize.size16) {
784 					param.immType = ArgType.WORD;
785 					argSrc.imm16 = Imm16(con.i16);
786 				}
787 				else {
788 					param.immType = ArgType.DWORD;
789 					argSrc.imm32 = Imm32(con.i32);
790 				}
791 				param.srcKind = AsmArgKind.IMM;
792 				break;
793 
794 			case physicalRegister:
795 				argSrc.reg = indexToRegister(src);
796 				param.srcKind = AsmArgKind.REG;
797 				break;
798 
799 			case global, stackSlot:
800 				// This should not happen. Stack slot or global must go through mov or load instruction.
801 				context.internal_error("Cannot encode %s %s %s in %s %s", op, dst, src, context.idString(lir.name), instrIndex);
802 		}
803 		gen.encodeRegular(argDst, argSrc, param);
804 	}
805 
806 	/// Generate move from src operand to dst operand. Size of destination is used
807 	void genMove(IrIndex dst, IrIndex src)
808 	{
809 		// i64 <- i32 must be 32bit move if both sides are registers.
810 		IrArgSize argSize;
811 		if (src.isPhysReg)
812 			argSize = cast(IrArgSize)min(dst.physRegSize, src.physRegSize);
813 		else
814 			argSize = cast(IrArgSize)dst.physRegSize;
815 
816 		version(emit_mc_print) writefln("genMove %s %s %s", dst, src, argSize);
817 		MoveType moveType = calcMoveType(dst.kind, src.kind);
818 
819 		if (moveType != MoveType.invalid && dst == src) return;
820 
821 		Register srcReg = cast(Register)src.physRegIndex;
822 		Register dstReg = cast(Register)dst.physRegIndex;
823 
824 		switch(moveType)
825 		{
826 			default:
827 				context.internal_error("Invalid move from %s to %s", IrIndexDump(dst, context, lir), IrIndexDump(src, context, lir));
828 
829 			case MoveType.const_to_reg:
830 				IrConstant con = context.constants.get(src);
831 				version(emit_mc_print) writefln("  move.%s reg:%s, con:%s", argSize, dstReg, con.i64);
832 				if (con.i64 == 0) // xor
833 				{
834 					if (dst.physRegClass == AMD64_REG_CLASS.GPR) {
835 						AsmArg argDst = {reg : dstReg};
836 						AsmArg argSrc = {reg : dstReg};
837 						AsmOpParam param = AsmOpParam(AsmArgKind.REG, AsmArgKind.REG, AMD64OpRegular.xor, cast(ArgType)IrArgSize.size32);
838 						gen.encodeRegular(argDst, argSrc, param);
839 					} else if (dst.physRegClass == AMD64_REG_CLASS.XMM) {
840 						// TODO: replace with pxor
841 						// See: https://stackoverflow.com/questions/33666617/what-is-the-best-way-to-set-a-register-to-zero-in-x86-assembly-xor-mov-or-and
842 						gen.xorps(dstReg, dstReg);
843 					}
844 				}
845 				else
846 				{
847 					if (dst.physRegClass == AMD64_REG_CLASS.GPR) {
848 						final switch(argSize) with(IrArgSize) {
849 							case size8: gen.movb(dstReg, Imm8(con.i8)); break;
850 							case size16: gen.movw(dstReg, Imm16(con.i16)); break;
851 							case size32: gen.movd(dstReg, Imm32(con.i32)); break;
852 							case size64:
853 								if (!con.intFitsIn32Bits)
854 									gen.movq(dstReg, Imm64(con.i64));
855 								else {
856 									if (con.u32_top == uint.max) {
857 										gen.movq(dstReg, Imm32(con.i32)); // sign-extend 32bit constant to 64bit register
858 									} else {
859 										gen.movd(dstReg, Imm32(con.i32)); // zero-extend 32bit constant to 64bit register
860 									}
861 								}
862 								break;
863 							case size128, size256, size512:
864 								context.internal_error("Not implemented: const_to_reg %s %s", dst, src);
865 						}
866 					} else if (dst.physRegClass == AMD64_REG_CLASS.XMM) {
867 						LinkIndex roSectionIndex = context.builtinSections[ObjectSectionType.ro_data];
868 						ObjectSymbol* funcSym = context.objSymTab.getSymbol(fun.backendData.objectSymIndex);
869 						ObjectSymbol sym = {
870 							kind : ObjectSymbolKind.isLocal,
871 							sectionIndex : roSectionIndex,
872 							moduleIndex : funcSym.moduleIndex,
873 							id : context.idMap.getOrReg(context, ":float"),
874 						};
875 						LinkIndex symIndex = context.objSymTab.addSymbol(sym);
876 						ObjectSymbol* globalSym = context.objSymTab.getSymbol(symIndex);
877 						ObjectSection* roSection = context.objSymTab.getSection(roSectionIndex);
878 						globalSym.sectionOffset = cast(uint)roSection.buffer.length;
879 
880 						final switch(argSize) with(IrArgSize) {
881 							case size32:
882 								globalSym.setInitializer(context.roStaticDataBuffer.nextPtr[0..4]);
883 								context.roStaticDataBuffer.put(con.i32);
884 								gen.movd_xr(dstReg, memAddrRipDisp32(0));
885 								break;
886 							case size64:
887 								globalSym.setInitializer(context.roStaticDataBuffer.nextPtr[0..8]);
888 								context.roStaticDataBuffer.put(con.i64);
889 								gen.movq_xr(dstReg, memAddrRipDisp32(0));
890 								break;
891 							case size8, size16, size128, size256, size512: context.internal_error("genMove XMM <- const %s", argSize);
892 						}
893 						addRefTo(symIndex);
894 					}
895 				}
896 				break;
897 
898 			// copy address of global into register
899 			case MoveType.global_to_reg:
900 				context.assertf(dst.physRegClass == AMD64_REG_CLASS.GPR, "global_to_reg %s", dst);
				// HACK, TODO: a 32-bit view of the register comes in here, while the pointer needs the full 64 bits
902 				MemAddress addr = memAddrRipDisp32(0);
903 				gen.lea(dstReg, addr, cast(ArgType)IrArgSize.size64);
904 				addRefTo(src);
905 				break;
906 
907 			// copy address of function into register
908 			case MoveType.func_to_reg:
909 				context.assertf(dst.physRegClass == AMD64_REG_CLASS.GPR, "func_to_reg %s", dst);
910 				FunctionDeclNode* func = context.getFunction(src);
911 				LinkIndex entityIndex = func.backendData.objectSymIndex;
912 				ObjectSymbol* sym = context.objSymTab.getSymbol(entityIndex);
913 				if (sym.isIndirect) {
914 					// read address from the import section
915 					MemAddress addr = memAddrRipDisp32(0);
916 					gen.mov(dstReg, addr, cast(ArgType)IrArgSize.size64);
917 				} else {
918 					// take address of the symbol
919 					MemAddress addr = memAddrRipDisp32(0);
920 					gen.lea(dstReg, addr, cast(ArgType)IrArgSize.size64);
921 				}
922 				addRefTo(entityIndex);
923 				break;
924 
925 			case MoveType.reg_to_reg:
926 				version(emit_mc_print) writefln("  move.%s reg:%s, reg:%s", argSize, dstReg, srcReg);
927 				if (src.physRegClass == AMD64_REG_CLASS.XMM && dst.physRegClass == AMD64_REG_CLASS.XMM) {
928 					final switch(argSize) with(IrArgSize) {
929 						case size8, size16:
930 							context.internal_error("Not implemented: reg_to_reg %s %s", dst, src);
931 						case size32: gen.movss(dstReg, srcReg); break;
932 						case size64: gen.movsd(dstReg, srcReg); break;
933 						case size128: gen.movups(dstReg, srcReg); break;
934 						case size256, size512:
935 							context.internal_error("Not implemented: reg_to_reg %s %s", dst, src);
936 					}
937 				} else if (src.physRegClass == AMD64_REG_CLASS.XMM) {
938 					final switch(argSize) with(IrArgSize) {
939 						case size32: gen.movd_rx(dstReg, srcReg); break;
940 						case size64: gen.movq_rx(dstReg, srcReg); break;
941 						case size8, size16, size128, size256, size512:
942 							context.internal_error("Not implemented: reg_to_reg %s %s", dst, src);
943 					}
944 				} else if (dst.physRegClass == AMD64_REG_CLASS.XMM) {
945 					final switch(argSize) with(IrArgSize) {
946 						case size32: gen.movd_xr(dstReg, srcReg); break;
947 						case size64: gen.movq_xr(dstReg, srcReg); break;
948 						case size8, size16, size128, size256, size512:
949 							context.internal_error("Not implemented: reg_to_reg %s %s", dst, src);
950 					}
951 				} else {
952 					if (dstReg != srcReg) {
953 						gen.mov(dstReg, srcReg, cast(ArgType)argSize);
954 					}
955 				}
956 				break;
957 
958 			// copy address of stack slot into register
959 			case MoveType.stack_to_reg:
960 				context.assertf(dst.physRegClass == AMD64_REG_CLASS.GPR, "stack_to_reg %s", dst);
961 				gen.lea(dstReg, localVarMemAddress(src), cast(ArgType)IrArgSize.size64);
962 				break;
963 		}
964 	}
965 
966 	void fix_PC_REL_32(PC fixup, PC target)
967 	{
968 		*cast(Imm32*)(fixup-4) = jumpOffset(fixup, target);
969 	}
970 
	// nextInstr is the address the offset is relative to (the address of the instruction following the one being fixed)
972 	void fix_PC_REL_CUSTOM(Imm32* offset, PC nextInstr, PC target)
973 	{
974 		*offset = jumpOffset(nextInstr, target);
975 	}
976 
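	// Loads from srcMem into the physical register dst, picking a GPR or XMM move
	// based on the destination register class and argSize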
977 	void doMemToReg(IrIndex dst, MemAddress srcMem, IrArgSize argSize) {
978 		Register dstReg = indexToRegister(dst);
979 		if (dst.physRegClass == AMD64_REG_CLASS.XMM) {
980 			final switch(argSize) with(IrArgSize) {
981 				case size32: gen.movd_xr(dstReg, srcMem); break;
982 				case size64: gen.movq_xr(dstReg, srcMem); break;
983 				case size128: gen.movups(dstReg, srcMem); break;
984 				case size8, size16, size256, size512: context.internal_error("doMemToReg %s", argSize);
985 			}
986 		} else {
987 			gen.mov(dstReg, srcMem, cast(ArgType)argSize);
988 		}
989 	}
990 
	/// Generate a load from memory at src into the dst register. The size of the destination is used.
	// If src is a phys reg it is used as the address base.
	// dst must be a phys reg
994 	void genLoad(IrIndex dst, IrIndex src)
995 	{
996 		IrArgSize argSize = cast(IrArgSize)dst.physRegSize;
997 		bool valid = dst.isPhysReg && (src.isPhysReg || src.isStackSlot || src.isGlobal);
998 		context.assertf(valid, "Invalid load %s -> %s", src.kind, dst.kind);
999 
1000 		switch(src.kind) with(IrValueKind) {
1001 			case physicalRegister: doMemToReg(dst, memAddrBase(indexToRegister(src)), argSize); break;
1002 			case stackSlot: doMemToReg(dst, localVarMemAddress(src), argSize); break;
			case global:
1004 				doMemToReg(dst, memAddrRipDisp32(0), argSize);
1005 				addRefTo(src);
1006 				break;
1007 
1008 			default:
1009 				context.internal_error("invalid source of load %s", src.kind);
1010 		}
1011 	}
1012 
	// dst must be a pointer; it points to memory of unknown type, which is why an explicit argSize is needed
1015 	void genStore(IrIndex dst, IrIndex src, IrArgSize argSize)
1016 	{
1017 		context.assertf(!src.isGlobal,
1018 			"store %s <- %s, must go through intermediate register",
1019 			dst.kind, src.kind);
1020 
1021 		void doRegToMem(MemAddress dstMem) {
1022 			if (src.physRegClass == AMD64_REG_CLASS.XMM) {
1023 				Register srcReg = indexToRegister(src);
1024 				final switch(argSize) with(IrArgSize) {
1025 					case size32: gen.movd_rx(dstMem, srcReg); break;
1026 					case size64: gen.movq_rx(dstMem, srcReg); break;
1027 					case size128: gen.movups(dstMem, srcReg); break;
1028 					case size8, size16, size256, size512: context.internal_error("doRegToMem %s", argSize);
1029 				}
1030 			} else {
1031 				Register srcReg = indexToRegister(src);
1032 				gen.mov(dstMem, srcReg, cast(ArgType)argSize);
1033 			}
1034 		}
1035 		void doConToMem(MemAddress dstMem, IrConstant con) {
1036 			final switch(argSize) with(IrArgSize) {
1037 				case size8: gen.movb(dstMem, Imm8(con.i8)); break;
1038 				case size16: gen.movw(dstMem, Imm16(con.i16)); break;
1039 				case size32: gen.movd(dstMem, Imm32(con.i32)); break;
1040 				case size64:
1041 					context.assertf(con.intFitsIn32Bits, "Constant 0x%X is too big", con.i64);
1042 					gen.movq(dstMem, Imm32(con.i32));
1043 					break;
1044 				case size128, size256, size512: context.internal_error("doConToMem %s", argSize);
1045 			}
1046 		}
1047 
1048 		MoveType moveType = calcMoveType(dst.kind, src.kind);
1049 		switch (moveType) with(MoveType)
1050 		{
1051 			case const_to_stack:
1052 				IrConstant con = context.constants.get(src);
1053 				MemAddress dstMem = localVarMemAddress(dst);
1054 				doConToMem(dstMem, con);
1055 				break;
1056 			case const_to_reg:
1057 				IrConstant con = context.constants.get(src);
1058 				Register dstReg = indexToRegister(dst);
1059 				MemAddress dstMem = memAddrBase(dstReg);
1060 				doConToMem(dstMem, con);
1061 				break;
1062 			case reg_to_stack:
1063 				MemAddress dstMem = localVarMemAddress(dst);
1064 				doRegToMem(dstMem);
1065 				break;
1066 			case reg_to_reg:
1067 				Register dstReg = indexToRegister(dst);
1068 				MemAddress dstMem = memAddrBase(dstReg);
1069 				doRegToMem(dstMem);
1070 				break;
1071 			case const_to_global:
1072 				IrConstant con = context.constants.get(src);
1073 				MemAddress dstMem = memAddrRipDisp32(0);
1074 				doConToMem(dstMem, con);
1075 				addRefTo(dst, 8);
1076 				break;
1077 			case reg_to_global:
1078 				MemAddress dstMem = memAddrRipDisp32(0);
1079 				doRegToMem(dstMem);
1080 				addRefTo(dst);
1081 				break;
1082 			default:
1083 				context.internal_error("store %s <- %s is not implemented", dst.kind, src.kind);
1084 		}
1085 	}
1086 }
1087 
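// Classifies a (dst kind, src kind) pair into one of the supported move shapes;
// MoveType.invalid means the combination must have been legalized before code emission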
1088 MoveType calcMoveType(IrValueKind dst, IrValueKind src)
1089 {
1090 	switch(dst) with(IrValueKind) {
1091 		case none, array, constant: return MoveType.invalid;
1092 		case virtualRegister: return MoveType.invalid;
1093 		case physicalRegister:
1094 			switch(src) with(IrValueKind) {
1095 				case constant, constantZero: return MoveType.const_to_reg;
1096 				case global: return MoveType.global_to_reg;
1097 				case physicalRegister: return MoveType.reg_to_reg;
1098 				case stackSlot: return MoveType.stack_to_reg;
1099 				case func: return MoveType.func_to_reg;
1100 				default: return MoveType.invalid;
1101 			}
1102 		case stackSlot:
1103 			switch(src) with(IrValueKind) {
1104 				case constant: return MoveType.const_to_stack;
1105 				case physicalRegister: return MoveType.reg_to_stack;
1106 				default: return MoveType.invalid;
1107 			}
1108 		case global:
1109 			switch(src) with(IrValueKind) {
1110 				case constant: return MoveType.const_to_global;
1111 				case physicalRegister: return MoveType.reg_to_global;
1112 				default: return MoveType.invalid;
1113 			}
1114 		default: return MoveType.invalid;
1115 	}
1116 }
1117 
1118 enum MoveType
1119 {
1120 	invalid,
1121 	const_to_reg,
1122 	const_to_global,
1123 	global_to_reg,
1124 	const_to_stack,
1125 	reg_to_reg,
1126 	reg_to_stack,
1127 	reg_to_global,
1128 	stack_to_reg,
1129 	const_to_mem,
1130 	reg_to_mem,
1131 	mem_to_reg,
1132 	func_to_reg,
1133 }