1 /// Copyright: Copyright (c) 2018-2020 Penechko.
2 /// License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
3 /// Authors: Andrey Penechko.
4 
5 /// Linear scan register allocation pass
6 ///
7 /// Notes:
8 /// Non-split current interval does not intersect with inactive intervals because of SSA form
9 ///
/// Splits need to happen at odd positions. This gives:
///  a) no intervals of the form [x; x)
///  b) no problems with the mov that fixes two-operand form before a two-operand instruction
///     the two-operand form fix can insert a mov from the first operand into the result register,
///     and if the first operand's interval were split at the instruction, it would start there and not include the mov;
///     as a result the register allocator could allocate arg0's register to the reloaded argument,
///     but the reload would precede the mov, thus overwriting the value
17 ///     100 | v0 = add v1, v2   | eax(v0) = add ecx(v1), ecx(v2)
18 ///     v0 [100; 200) eax
19 ///     v1 [50; 100) ecx
20 ///     v2 [40; 99) s0 [100; 109) ecx
21 ///    becomes
///      99|    ecx = load i32* s0  // the split move overwrites ecx, which is used in the next instruction
///      99|    eax = mov ecx       // fix two-operand form
24 ///     100|    eax = add eax, ecx
25 ///
/// As a result of splits being at odd positions, another invariant holds:
///   All ranges have distinct `from` and `to` positions; zero-length ranges become impossible.
28 ///   range.from < range.to
29 module vox.be.reg_alloc.linear_scan;
30 
31 import std.array : empty;
32 import std.string : format;
33 import std.stdio : writeln, write, writef, writefln, stdout;
34 import std.format : formattedWrite, FormatSpec;
import std.range : repeat, chain;
37 import std.algorithm : min, max, sort, swap;
38 
39 import vox.all;
40 
41 //version = RAPrint;
42 //version = RAPrint_resolve;
43 
44 /// Does linear scan register allocation.
/// Uses live intervals produced by pass_live_intervals.
46 void pass_linear_scan(LinearScan* linearScan)
47 {
48 	CompilationContext* c = linearScan.context;
49 	assert(!linearScan.fun.isExternal);
50 
51 	linearScan.scanFun();
52 
53 	if (c.printLirRA && c.printDumpOf(linearScan.fun)) {
54 		IrFunction* lirData = c.getAst!IrFunction(linearScan.fun.backendData.lirData);
55 		dumpFunction(c, lirData, "RA");
56 		linearScan.livePtr.dump(c, lirData);
57 	}
58 }
59 
60 struct RegisterState
61 {
62 	IntervalIndex activeInterval;
63 }
64 
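/// Flattens (regClass, physRegIndex) into an index into the per-register arrays
/// of PhysRegisters below. Example (hypothetical NUM_REGS_PER_CLASS == 16):
/// class 0 reg 3 maps to 3, class 1 reg 2 maps to 18.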
65 pure ubyte arrayPhysRegIndex(IrIndex reg) {
66 	return cast(ubyte)(reg.physRegClass * NUM_REGS_PER_CLASS + reg.physRegIndex);
67 }
68 
69 struct PhysRegisters
70 {
71 	RegisterState[NUM_TOTAL_REGS] registers;
72 	uint[NUM_TOTAL_REGS] usePos;
73 	uint[NUM_TOTAL_REGS] blockPos;
74 	// bitmask per class
75 	FullRegSet volatileRegs;
76 	FullRegSet calleeSavedRegs;
77 	FullRegSet usedRegs;
78 
	ref RegisterState opIndex(IrIndex reg) return {
		assert(reg.isPhysReg, format("%s", reg));
		return registers[reg.arrayPhysRegIndex];
	}
83 
84 	void setup(CompilationContext* c, IrFunction* lir, MachineInfo* machineInfo)
85 	{
86 		CallConv* callConv = lir.getCallConv(c);
87 
88 		if (c.debugRegAlloc) {
89 			volatileRegs = callConv.volatileRegs.lowest(5); // only 5 regs are available for tests
90 			calleeSavedRegs = FullRegSet.init;
91 		} else {
92 			volatileRegs = callConv.volatileRegs;
93 			calleeSavedRegs = callConv.calleeSaved;
94 			if (c.useFramePointer) {
95 				// make frame pointer not allocatable
96 				calleeSavedRegs.disable(callConv.framePointer);
97 			}
98 		}
99 		usedRegs = FullRegSet.init;
100 		registers = typeof(registers).init;
101 	}
102 
	void setUsePos(IrIndex reg, uint pos) {
		usePos[reg.arrayPhysRegIndex] = min(usePos[reg.arrayPhysRegIndex], pos);
	}

	void setBlockPos(IrIndex reg, uint pos) {
		blockPos[reg.arrayPhysRegIndex] = min(blockPos[reg.arrayPhysRegIndex], pos);
		usePos[reg.arrayPhysRegIndex] = min(usePos[reg.arrayPhysRegIndex], pos);
	}
111 
112 	void resetBlockPos()
113 	{
114 		usePos[] = MAX_USE_POS;
115 		blockPos[] = MAX_USE_POS;
116 	}
117 
118 	void resetUsePos()
119 	{
120 		usePos[] = MAX_USE_POS;
121 	}
122 
123 	void markAsUsed(IrIndex reg)
124 	{
125 		usedRegs |= PhysReg(cast(ubyte)reg.physRegClass, cast(ubyte)reg.physRegIndex);
126 	}
127 }
128 
// data shared between all split children of an interval,
// identified by vreg index
131 struct VregState
132 {
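	// cached by assignSpillSlot so that every split child of the same vreg
	// spills to and reloads from the same slot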
133 	IrIndex spillSlot;
134 }
135 
136 /// Implementation of:
137 /// "Optimized Interval Splitting in a Linear Scan Register Allocator"
138 /// "Linear Scan Register Allocation on SSA Form"
139 struct LinearScan
140 {
141 	VregState[] vregState;
142 	Array!IntervalIndex activeVirtual;
143 	Array!IntervalIndex activeFixed;
144 	Array!IntervalIndex inactiveVirtual;
145 	Array!IntervalIndex inactiveFixed;
146 	// unhandled = list of intervals sorted by increasing start positions
147 	IntervalPriorityQueue unhandled;
	// List of handled split intervals that begin in the middle of a basic block
	// Intervals are ordered by increasing start position
150 	Array!IntervalIndex pendingMoveSplits;
151 	PhysRegisters physRegs;
152 	CompilationContext* context;
153 	IrBuilder* builder;
154 	FunctionDeclNode* fun;
155 
156 	LivenessInfo* livePtr;
157 	ref LivenessInfo live() { return *livePtr; }
158 	IrFunction* lir;
159 
160 	IrIndex scratchSpillSlot;
161 
162 	void freeMem() {
163 		activeVirtual.free(context.arrayArena);
164 		activeFixed.free(context.arrayArena);
165 		inactiveVirtual.free(context.arrayArena);
166 		inactiveFixed.free(context.arrayArena);
167 		pendingMoveSplits.free(context.arrayArena);
168 		unhandled.free(context.arrayArena);
169 	}
170 
171 	void checkActiveImpl(uint position, ref Array!IntervalIndex active, ref Array!IntervalIndex inactive) {
172 		size_t index;
		// check for intervals in active that are handled or inactive
174 		// for each interval it in active do
175 		while (index < active.length)
176 		{
177 			IntervalIndex activeId = active[index];
178 			LiveInterval* activeInt = &live.intervals[activeId];
179 
180 			// already spilled
181 			if (activeInt.reg.isStackSlot)
182 			{
183 				version(RAPrint) writefln("  move %s active -> handled, spilled to %s", activeId, activeInt.reg);
184 				active.removeInPlace(index);
185 				continue;
186 			}
187 
188 			LiveRangeIndex rangeId = activeInt.getRightmostRange(position);
189 
190 			// if it ends before position then
191 			if (rangeId.isNull)
192 			{
193 				// move it from active to handled
194 				version(RAPrint) writefln("  move %s active -> handled", activeId);
195 				active.removeInPlace(index);
196 			}
197 			// else if it does not cover position then
198 			else if(!activeInt.ranges[rangeId].contains(position))
199 			{
200 				// move it from active to inactive
201 				version(RAPrint) writefln("  move %s active -> inactive", activeId);
202 				active.removeInPlace(index);
203 				inactive.put(context.arrayArena, activeId);
204 			}
205 			else
206 			{
207 				++index;
208 			}
209 		}
210 	}
211 
212 	void checkActive(uint position) {
213 		checkActiveImpl(position, activeFixed, inactiveFixed);
214 		checkActiveImpl(position, activeVirtual, inactiveVirtual);
215 	}
216 
217 	void checkInactiveImpl(uint position, ref Array!IntervalIndex active, ref Array!IntervalIndex inactive) {
218 		size_t index = 0;
219 		// check for intervals in inactive that are handled or active
220 		// for each interval it in inactive do
221 		while (index < inactive.length)
222 		{
223 			IntervalIndex inactiveId = inactive[index];
224 			LiveInterval* inactiveInt = &live.intervals[inactiveId];
225 			LiveRangeIndex rangeId = inactiveInt.getRightmostRange(position);
226 
227 			// if it ends before position then
228 			if (rangeId.isNull)
229 			{
230 				// move it from inactive to handled
231 				version(RAPrint) writefln("  move %s inactive -> handled", inactiveId);
232 				inactive.removeInPlace(index);
233 			}
			// else if it covers position then
235 			else if(inactiveInt.ranges[rangeId].contains(position))
236 			{
237 				// move it from inactive to active
238 				version(RAPrint) writefln("  move %s inactive -> active", inactiveId);
239 				active.put(context.arrayArena, inactiveId);
240 				inactive.removeInPlace(index);
241 			}
242 			else
243 				++index;
244 		}
245 	}
246 
247 	void checkInactive(uint position) {
248 		checkInactiveImpl(position, activeFixed, inactiveFixed);
249 		checkInactiveImpl(position, activeVirtual, inactiveVirtual);
250 	}
251 
252 	/*
253 		LinearScan
254 		unhandled = list of intervals sorted by increasing start positions
255 		active = { }; inactive = { }; handled = { }
256 
257 		while unhandled != { } do
258 			current = pick and remove first interval from unhandled
259 			position = start position of current
260 
261 			// check for intervals in active that are handled or inactive
262 			for each interval it in active do
263 				if it ends before position then
264 					move it from active to handled
265 				else if it does not cover position then
266 					move it from active to inactive
267 
268 			// check for intervals in inactive that are handled or active
269 			for each interval it in inactive do
270 				if it ends before position then
271 					move it from inactive to handled
272 				else if it covers position then
273 					move it from inactive to active
274 
275 			// find a register for current
276 			TryAllocateFreeReg
277 			if allocation failed then AllocateBlockedReg
278 
279 			if current has a register assigned then add current to active
280 	*/
281 	void scanFun()
282 	{
284 		lir = context.getAst!IrFunction(fun.backendData.lirData);
285 		physRegs.setup(context, lir, context.machineInfo);
286 		vregState = context.allocateTempArray!VregState(lir.numVirtualRegisters);
287 
288 		//writefln("\nstart scan of %s", context.idString(lir.name));
289 		scope(exit) {
290 			lir = null;
291 		}
292 
293 		// active = { }; inactive = { }; handled = { }
294 		activeVirtual.clear;
295 		activeFixed.clear;
296 		inactiveVirtual.clear;
297 		inactiveFixed.clear;
298 		pendingMoveSplits.clear;
299 		unhandled.clear;
300 
301 		size_t i = 0;
302 		foreach (ref LiveInterval it; live.virtualIntervals) {
303 			if (!it.ranges.empty)
304 				unhandled.insert(context.arrayArena, live.vindex(i), it.from);
305 			++i;
306 		}
307 		i = 0;
308 		foreach (ref LiveInterval it; live.physicalIntervals) {
309 			if (!it.ranges.empty)
310 				inactiveFixed.put(context.arrayArena, live.pindex(i));
311 			++i;
312 		}
313 
314 		IrIndex currentBlock = lir.entryBasicBlock;
315 
316 		// while unhandled != { } do
317 		while (!unhandled.empty)
318 		{
319 			// current = pick and remove first interval from unhandled
320 			IntervalIndex currentIndex = unhandled.removeFront;
321 			LiveInterval* currentInterval = &live.intervals[currentIndex];
322 
323 			// position = start position of current
324 			uint position = currentInterval.from;
325 			version(RAPrint) writefln("current %s %s pos %s firstUse %s", currentIndex, *currentInterval, position, currentInterval.firstUse);
326 
327 			checkActive(position);
328 			checkInactive(position);
329 
330 			if (currentInterval.isSplitChild)
331 			{
332 				version(RAPrint) writefln("  current is split child");
333 				if (!live.isBlockStartAt(lir, position, currentBlock))
334 				{
335 					version(RAPrint) writefln("  current is in the middle of the block %s", currentBlock);
336 					pendingMoveSplits.put(context.arrayArena, currentIndex);
337 				}
338 
339 				// skip allocation if this interval has spill slot assigned
340 				if (currentInterval.reg.isStackSlot) {
341 					version(RAPrint) writefln("  current is spilled to %s. skip allocation", currentInterval.reg);
342 					continue;
343 				}
344 			}
345 
346 			ubyte regClass = currentInterval.regClass;
347 
348 			// find a register for current
349 			bool success = tryAllocateFreeReg(currentIndex, regClass);
350 
351 			// if allocation failed then AllocateBlockedReg
352 			if (!success) {
353 				allocateBlockedReg(fun, currentIndex, position, regClass);
354 			}
355 
356 			currentInterval = &live.intervals[currentIndex]; // intervals may have reallocated
357 
358 			version(RAPrint) writefln("  alloc current %s %s %s to %s", currentIndex, currentInterval, *currentInterval, IrIndexDump(currentInterval.reg, context, lir));
359 
360 			// if current has a register assigned then add current to active
361 			if (currentInterval.reg.isPhysReg)
362 			{
363 				// don't add stack slot assigned interval to active
364 				version(RAPrint) writefln("  move current %s %s -> active", currentIndex, *currentInterval);
365 				activeVirtual.put(context.arrayArena, currentIndex);
366 			}
367 
368 			version(RAPrint) writeln;
369 		}
370 
371 		//live.dump(context, lir);
372 
373 		MoveSolver moveSolver = MoveSolver(context);
374 		moveSolver.setup(lir);
375 		fixInstructionArgs(fun);
376 		// this pass inserts moves for spills and loads
377 		resolveInsertSplitMoves(moveSolver);
378 		// this pass inserts moves to resolve two-operand form,
379 		// so it needs to be after `resolveInsertSplitMoves`
380 		fixInstructionResults(fun);
381 		resolveControlFlow(moveSolver);
382 		genSaveCalleeSavedRegs();
383 
384 		moveSolver.release();
385 		lir.removeAllPhis;
386 
387 		if (context.validateIr) validateIrFunction(context, lir, "Reg alloc");
388 
389 		//writefln("finished scan of %s\n", context.idString(lir.name));
390 	}
391 
392 	/*
393 	TRYALLOCATEFREEREG
394 		set freeUntilPos of all physical registers to maxInt
395 
396 		for each interval it in active do
397 			freeUntilPos[it.reg] = 0
398 		for each interval it in inactive intersecting with current do
399 			freeUntilPos[it.reg] = next intersection of it with current
400 		// clarification: if current intersects active and inactive, freeUntilPos is 0
401 
402 		reg = register with highest freeUntilPos
403 		if freeUntilPos[reg] = 0 then
404 			// no register available without spilling
405 			allocation failed
406 		else if current ends before freeUntilPos[reg] then
407 			// register available for the whole interval
408 			current.reg = reg
409 		else
410 			// register available for the first part of the interval
411 			current.reg = reg
412 			split current before freeUntilPos[reg]
413 	*/
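	// Note: the implementation below reuses physRegs.usePos in place of the
	// paper's freeUntilPos (see the FreeUntilPos pretty-printer below).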
414 	bool tryAllocateFreeReg(IntervalIndex currentId, ubyte regClass)
415 	{
416 		// set usePos of all physical registers to maxInt
417 		physRegs.resetUsePos;
418 
419 		LiveInterval* currentIt = &live.intervals[currentId];
420 		uint currentEnd = currentIt.to;
421 
422 		static struct FreeUntilPos {
423 			uint num;
424 			void toString(scope void delegate(const(char)[]) sink) {
425 				if (num == MAX_USE_POS) formattedWrite(sink, "max");
426 				else formattedWrite(sink, "%s", num);
427 			}
428 		}
429 
430 		// for each interval it in active do
431 		//     usePos[it.reg] = 0
432 		version(RAPrint) writeln("  active:");
433 		foreach (IntervalIndex activeId; activeFixed) {
434 			LiveInterval* it = &live.intervals[activeId];
435 			physRegs.usePos[it.reg.arrayPhysRegIndex] = 0;
436 			version(RAPrint) writefln("   intp %s %s reg %s (next use 0)", activeId, IrIndexDump(it.definition, context, lir), IrIndexDump(it.reg, context, lir));
437 		}
438 		foreach (IntervalIndex activeId; activeVirtual) {
439 			LiveInterval* it = &live.intervals[activeId];
440 			physRegs.usePos[it.reg.arrayPhysRegIndex] = 0;
441 			version(RAPrint) writefln("   intv %s %s reg %s (next use 0)", activeId, *it, IrIndexDump(it.reg, context, lir));
442 		}
443 
444 		// for each interval it in inactive intersecting with current do
445 		//   usePos[it.reg] = next intersection of it with current
446 		version(RAPrint) writeln("  inactive:");
447 		foreach (IntervalIndex inactiveId; inactiveFixed) {
448 			LiveInterval* it = &live.intervals[inactiveId];
449 			version(RAPrint) writef("   intp %s %s (next use %s)", IrIndexDump(it.reg, context, lir), IrIndexDump(it.definition, context, lir), FreeUntilPos(physRegs.usePos[it.reg.arrayPhysRegIndex]));
450 
451 			// if current intersects both active and inactive, usePos stays 0
452 			if (physRegs.usePos[it.reg.arrayPhysRegIndex] == 0) { version(RAPrint) writeln;
453 				continue;
454 			}
			// if there is no intersection, firstIntersection returns MAX_USE_POS (a no-op for the min below)
			uint inactiveIntersection = firstIntersection(currentIt, it);

			// the register may already be occupied by an active or another inactive interval, so preserve its use pos
459 			physRegs.usePos[it.reg.arrayPhysRegIndex] = min(physRegs.usePos[it.reg.arrayPhysRegIndex], inactiveIntersection);
460 			version(RAPrint) writefln(":=%s", FreeUntilPos(inactiveIntersection));
461 		}
462 
		// We only need to check inactive intervals when the current interval was split,
		// because otherwise SSA form is intact and there can be no intersections with inactive intervals
465 		if (currentIt.isSplitChild)
466 		{
467 			foreach (IntervalIndex inactiveId; inactiveVirtual) {
468 				LiveInterval* it = &live.intervals[inactiveId];
469 				assert(it.reg.isPhysReg, "not a reg");
470 				version(RAPrint) writef("   intv %s %s (next use %s)", IrIndexDump(it.reg, context, lir), it.definition, FreeUntilPos(physRegs.usePos[it.reg.arrayPhysRegIndex]));
471 
472 				// if current intersects both active and inactive, usePos stays 0
473 				if (physRegs.usePos[it.reg.arrayPhysRegIndex] == 0) { version(RAPrint) writeln;
474 					continue;
475 				}
				// if there is no intersection, firstIntersection returns MAX_USE_POS (a no-op for the min below)
				uint inactiveIntersection = firstIntersection(currentIt, it);
				// the register may already be occupied by an active or another inactive interval, so preserve its use pos
479 				physRegs.usePos[it.reg.arrayPhysRegIndex] = min(physRegs.usePos[it.reg.arrayPhysRegIndex], inactiveIntersection);
480 				version(RAPrint) writefln(":=%s", FreeUntilPos(inactiveIntersection));
481 			}
482 		}
483 
484 		version(RAPrint) writefln("  Try alloc free reg for %s %s", *currentIt, currentId);
485 
486 		// reg = register with highest usePos
487 		uint maxPos = 0;
488 		IrIndex reg;
489 
490 		version(RAPrint) write("  candidates:");
		// prioritize volatile regs by iterating them first
492 		foreach (ubyte regIndex; physRegs.volatileRegs.classes[regClass]) {
493 			uint usePos = physRegs.usePos[regClass * NUM_REGS_PER_CLASS + regIndex];
494 			version(RAPrint) writef(" %s:%s", IrIndexDump(IrIndex(regIndex, ArgType.QWORD, regClass), context, lir), FreeUntilPos(usePos));
495 			if (usePos > maxPos) {
496 				maxPos = usePos;
497 				reg = IrIndex(regIndex, ArgType.QWORD, regClass);
498 			}
499 		}
500 		foreach (ubyte regIndex; physRegs.calleeSavedRegs.classes[regClass]) {
501 			uint usePos = physRegs.usePos[regClass * NUM_REGS_PER_CLASS + regIndex];
502 			version(RAPrint) writef(" %s:%s", IrIndexDump(IrIndex(regIndex, ArgType.QWORD, regClass), context, lir), FreeUntilPos(usePos));
503 			if (usePos > maxPos) {
504 				maxPos = usePos;
505 				reg = IrIndex(regIndex, ArgType.QWORD, regClass);
506 			}
507 		}
508 		version(RAPrint) writeln;
509 
510 		// reg stored in hint
511 		IrIndex hintReg = currentIt.storageHint;
512 
513 		if (maxPos == 0)
514 		{
515 			// no register available without spilling
516 			version(RAPrint) writeln("    no register available without spilling");
			// special case:
			// if current is an interval without uses, we can just spill the whole interval.
			// This happens when an interval is used at the beginning of a loop, then split in the middle;
			// the tail part then contains no uses
521 			if (currentIt.uses.empty)
522 			{
523 				assignSpillSlot(currentIt);
524 				version(RAPrint) writeln("    spill whole interval because of no uses");
525 				return true;
526 			}
527 			return false;
528 		} else {
529 			version(RAPrint) writefln("    hint is %s", IrIndexDump(hintReg, context, lir));
530 			if (hintReg.isVirtReg) hintReg = live.vint(hintReg).reg;
531 			if (hintReg.isDefined) {
532 				version(RAPrint) writefln("    currentEnd %s use %s hint %s", currentEnd, physRegs.usePos[hintReg.arrayPhysRegIndex], IrIndexDump(hintReg, context, lir));
533 				if (currentEnd < physRegs.usePos[hintReg.arrayPhysRegIndex]) {
534 					// register available for the whole interval
535 					currentIt.reg = hintReg;
536 					currentIt.reg.physRegSize = typeToRegSize(lir.getValueType(context, currentIt.definition), context);
537 					physRegs.markAsUsed(hintReg);
538 					version(RAPrint) writefln("    alloc hint %s", IrIndexDump(hintReg, context, lir));
539 					return true;
540 				} else {
541 					version(RAPrint) writefln("    hint %s is not available for the whole interval", IrIndexDump(hintReg, context, lir));
542 					// use hint reg with spill if it is one of max use regs
543 					if (physRegs.usePos[hintReg.arrayPhysRegIndex] == maxPos) reg = hintReg;
544 				}
545 			}
546 
547 			if (currentEnd < maxPos) {
548 				// register available for the whole interval
549 				currentIt.reg = reg;
550 				currentIt.reg.physRegSize = typeToRegSize(lir.getValueType(context, currentIt.definition), context);
551 				physRegs.markAsUsed(reg);
552 				version(RAPrint) writefln("    alloc %s", IrIndexDump(reg, context, lir));
553 				return true;
554 			} else {
555 				// split
556 				version(RAPrint) writefln("    alloc %s + split at %s", IrIndexDump(reg, context, lir), maxPos);
557 
558 				// prevent split at even x when interval starts at x-1
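				// (splitting before `from + 1` would have to land on the odd position `from`
				// itself, creating an empty [from; from) interval, see invariant (a) in the module header)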
559 				if (currentIt.from + 1 == maxPos) return false;
560 
561 				splitBefore(currentId, maxPos);
562 				currentIt = &live.intervals[currentId]; // intervals may have reallocated
563 
564 				currentIt.reg = reg;
565 				currentIt.reg.physRegSize = typeToRegSize(lir.getValueType(context, currentIt.definition), context);
566 				physRegs.markAsUsed(reg);
567 				return true;
568 			}
569 		}
570 	}
571 
572 	/*
573 	ALLOCATEBLOCKEDREG
574 		set nextUsePos of all physical registers to maxInt
575 
576 		for each interval it in active do
577 			nextUsePos[it.reg] = next use of it after start of current
578 		for each interval it in inactive intersecting with current do
579 			nextUsePos[it.reg] = next use of it after start of current
580 
581 		reg = register with highest nextUsePos
582 		if first usage of current is after nextUsePos[reg] then
583 			// all other intervals are used before current,
584 			// so it is best to spill current itself
585 			assign spill slot to current
586 			split current before its first use position that requires a register
587 		else
588 			// spill intervals that currently block reg
589 			current.reg = reg
590 			split active interval for reg at position
591 			split any inactive interval for reg at the end of its lifetime hole
592 
593 		// make sure that current does not intersect with
594 		// the fixed interval for reg
595 		if current intersects with the fixed interval for reg then
596 			split current before this intersection
597 	*/
	// Frees a register for current by spilling: spills either current itself or the intervals blocking the chosen register
599 	void allocateBlockedReg(FunctionDeclNode* fun, IntervalIndex currentId, uint currentStart, ubyte regClass)
600 	{
601 		physRegs.resetBlockPos;
602 
603 		LiveInterval* currentIt = &live.intervals[currentId];
604 		assert(currentIt.ranges.length);
605 		version(RAPrint) writefln("allocateBlockedReg of int %s start %s", currentId, currentStart);
606 
607 		foreach (IntervalIndex activeId; activeVirtual) {
608 			LiveInterval* it = &live.intervals[activeId];
609 			physRegs.setUsePos(it.reg, it.nextUseAfter(currentStart));
610 			physRegs[it.reg].activeInterval = activeId;
611 			version(RAPrint) writefln("active virt usePos of %s %s use %s block %s", activeId, *it, physRegs.usePos[it.reg.arrayPhysRegIndex], physRegs.blockPos[it.reg.arrayPhysRegIndex]);
612 		}
		// We only need to check inactive intervals when the current interval was split,
		// because otherwise SSA form is intact and there can be no intersections with inactive intervals
615 		if (currentIt.isSplitChild) {
616 			foreach (inactiveId; inactiveVirtual) {
617 				LiveInterval* it = &live.intervals[inactiveId];
618 				physRegs.setUsePos(it.reg, it.nextUseAfter(currentStart));
619 				version(RAPrint) writefln("inactive virt usePos of %s %s use %s block %s", inactiveId, *it, physRegs.usePos[it.reg.arrayPhysRegIndex], physRegs.blockPos[it.reg.arrayPhysRegIndex]);
620 			}
621 		}
622 		foreach (IntervalIndex activeId; activeFixed) {
623 			LiveInterval* it = &live.intervals[activeId];
624 			physRegs.setBlockPos(it.reg, 0);
625 			physRegs[it.reg].activeInterval = IntervalIndex.NULL;
626 			version(RAPrint) writefln("active fixed usePos of %s %s use %s block %s", activeId, *it, physRegs.usePos[it.reg.arrayPhysRegIndex], physRegs.blockPos[it.reg.arrayPhysRegIndex]);
627 		}
628 		foreach (inactiveId; inactiveFixed) {
629 			LiveInterval* it = &live.intervals[inactiveId];
630 			uint intersection = firstIntersection(currentIt, it);
631 			if (intersection != MAX_USE_POS)
632 			{
633 				physRegs.setBlockPos(it.reg, intersection);
634 			}
635 			version(RAPrint) writefln("inactive fixed usePos of %s %s use %s block %s", inactiveId, *it, physRegs.usePos[it.reg.arrayPhysRegIndex], physRegs.blockPos[it.reg.arrayPhysRegIndex]);
636 		}
637 
638 		// reg = register with highest usePos
639 		uint maxPos = 0;
640 		IrIndex reg;
641 
642 		foreach (ubyte regIndex; (physRegs.volatileRegs | physRegs.calleeSavedRegs).classes[regClass]) {
643 			uint usePos = physRegs.usePos[regClass * NUM_REGS_PER_CLASS + regIndex];
644 			if (usePos > maxPos) {
				// if the register was only available for the start of the interval,
				// then all uses may have moved into the split child
647 				maxPos = usePos;
648 				reg = IrIndex(regIndex, ArgType.QWORD, regClass);
649 			}
650 		}
651 		version(RAPrint) writefln("    candidate %s usePos %s", IrIndexDump(reg, context, lir), maxPos);
652 
653 		uint firstUse = currentIt.firstUse;
654 
655 		if (maxPos < firstUse)
656 		{
657 			version(RAPrint) writefln("  spill current maxUse %s firstUse %s", maxPos, firstUse);
658 			// all other intervals are used before current, so it is best to spill current itself
659 			assignSpillSlot(currentIt);
660 			// split current before its first use position that requires a register
661 			splitBefore(currentId, firstUse);
662 			currentIt = &live.intervals[currentId]; // intervals may have reallocated
663 			version(RAPrint) writefln("    spill current %s", currentId);
664 		}
665 		else
666 		{
667 			// spill intervals that currently block reg
668 			currentIt.reg = reg;
669 			currentIt.reg.physRegSize = typeToRegSize(lir.getValueType(context, currentIt.definition), context);
670 			physRegs.markAsUsed(reg);
671 
672 			//	split active interval for reg at position
673 			void splitActiveOrInactive(IntervalIndex index)
674 			{
675 				LiveInterval* activeIt = &live.intervals[index];
				// split before current start; add to unhandled so the split move gets registered
677 				version(RAPrint) writefln("      split before current start %s", currentStart);
678 				IntervalIndex newInterval = splitBefore(index, currentStart);
679 
680 				LiveInterval* splitActiveIt = &live.intervals[newInterval];
681 				assignSpillSlot(splitActiveIt);
682 
683 				// if there is a use position in spilled interval, split before use
684 				uint nextActiveUse = splitActiveIt.nextUseAfter(currentStart);
685 				if (nextActiveUse != MAX_USE_POS)
686 				{
687 					version(RAPrint) writefln("      split before use %s", newInterval);
688 					splitBefore(newInterval, nextActiveUse);
689 				}
690 			}
691 
692 			IntervalIndex activeIndex = physRegs[reg].activeInterval;
			context.assertf(!activeIndex.isNull, "%s", IrIndexDump(reg, context, lir)); // null would mean a fixed interval, but those are filtered out above
694 			version(RAPrint) writefln("    split active interval %s", activeIndex);
695 			splitActiveOrInactive(activeIndex);
696 
697 			//	split any inactive interval for reg at the end of its lifetime hole
698 			foreach (inactiveId; inactiveVirtual)
699 			{
700 				LiveInterval* it = &live.intervals[inactiveId];
701 				if (sameIndexOrPhysReg(it.reg, reg))
702 				{
703 					version(RAPrint) writefln("    split inactive interval %s", inactiveId);
704 					splitActiveOrInactive(inactiveId);
705 				}
706 			}
707 
708 			// if current intersects with the fixed interval for reg then
709 			if (currentIt.exclusivePosition(physRegs.blockPos[reg.arrayPhysRegIndex]))
710 			{
711 				// split current before this intersection
712 				splitBefore(currentId, physRegs.blockPos[reg.arrayPhysRegIndex]);
713 			}
714 
715 			version(RAPrint) writefln("    spill currently active interval %s before %s for %s",
716 				activeIndex, currentStart, IrIndexDump(reg, context, lir));
717 		}
718 	}
719 
	// automatically adds the new interval to unhandled
721 	IntervalIndex splitBefore(IntervalIndex it, uint before)
722 	{
723 		IntervalIndex newInterval = live.splitBefore(context, lir, it, before);
724 		if (newInterval != it) {
725 			unhandled.insert(context.arrayArena, newInterval, live.intervals[newInterval].from);
726 		}
727 		return newInterval;
728 	}
729 
730 	IntervalIndex splitBefore(IntervalIndex it, uint before, LiveRangeIndex rangeIndex)
731 	{
732 		IntervalIndex newInterval = live.splitBefore(context, it, before, rangeIndex);
733 		if (newInterval != it) {
734 			unhandled.insert(context.arrayArena, newInterval, live.intervals[newInterval].from);
735 		}
736 		return newInterval;
737 	}
738 
739 	void assignSpillSlot(LiveInterval* it)
740 	{
741 		assert(it.definition.isVirtReg);
742 		VregState* state = &vregState[it.definition.storageUintIndex];
743 		if (state.spillSlot.isDefined)
744 		{
745 			// use cached slot
746 			it.reg = state.spillSlot;
747 			version(RAPrint) writefln("assignSpillSlot %s use cached %s", it.definition, state.spillSlot);
748 		}
749 		else
750 		{
751 			IrIndex type = lir.getValueType(context, it.definition);
752 			IrIndex slot = builder.appendStackSlot(type, context.types.typeSizeAndAlignment(type), StackSlotKind.spillSlot);
753 			state.spillSlot = slot; // cache slot
754 			version(RAPrint) writefln("assignSpillSlot %s new slot %s", it.definition, slot);
755 			it.reg = slot;
756 		}
		// TODO: need to fix instructions to use stack slots after RA.
		//       Not relevant yet:
		//       we always provide a register for all uses.
		//       A simple way would be to allow src arguments to be stack slots.
761 	}
762 
763 	// Replaces all uses of virtual registers with physical registers or stack slots
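	// E.g. the args of `v0 = add v1, v2` become `v0 = add ecx, edx` (registers
	// hypothetical here); results are rewritten later by fixInstructionResults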
764 	void fixInstructionArgs(FunctionDeclNode* fun)
765 	{
766 		// fix uses first, because we may copy arg to definition below
767 		foreach (IrIndex vregIndex, ref IrVirtualRegister vreg; lir.virtualRegisters)
768 		{
769 			foreach (IrIndex userIndex, uint numUses; vreg.users.range(lir))
770 			{
771 				final switch (userIndex.kind) with(IrValueKind)
772 				{
773 					case none, array, virtualRegister, physicalRegister, constant, constantAggregate, constantZero, global, basicBlock, stackSlot, type, func: assert(false);
774 					case instruction:
775 						uint pos = live.linearIndices.instr(userIndex);
776 						IrIndex reg = live.getRegFor(vregIndex, pos);
777 						foreach (ref IrIndex arg; lir.getInstr(userIndex).args(lir))
778 							if (arg == vregIndex)
779 							{
780 								arg = reg;
781 							}
782 						break;
783 
784 					case phi:
785 						IrPhi* phi = lir.getPhi(userIndex);
786 						IrIndex[] preds = lir.getBlock(phi.blockIndex).predecessors.data(lir);
787 						foreach (size_t i, ref IrIndex phiArg; phi.args(lir))
788 							if (phiArg == vregIndex)
789 							{
790 								IrIndex lastInstr = lir.getBlock(preds[i]).lastInstr;
791 								uint pos = live.linearIndices.instr(lastInstr);
792 								IrIndex reg = live.getRegFor(vregIndex, pos);
793 								//writefln("set %s arg reg %s -> %s at %s", userIndex, phiArg, reg, pos);
794 								phiArg = reg;
795 							}
796 						break;
797 					case variable: assert(false);
798 				}
799 			}
800 		}
801 	}
802 
803 	void fixInstructionResults(FunctionDeclNode* fun)
804 	{
805 		// fix definitions
806 		// args are already fixed
807 		foreach (IrIndex vregIndex, ref IrVirtualRegister vreg; lir.virtualRegisters)
808 		{
809 			switch(vreg.definition.kind) with(IrValueKind)
810 			{
811 				case instruction:
812 					uint pos = live.linearIndices.instr(vreg.definition);
813 					IrIndex resultReg = live.getRegFor(vregIndex, pos);
814 
815 					IrIndex instrIndex = vreg.definition;
816 					IrInstrHeader* instrHeader = lir.getInstr(instrIndex);
817 					instrHeader.result(lir) = resultReg;
818 
819 					InstrInfo instrInfo = context.machineInfo.instrInfo[instrHeader.op];
820 
					// requires two-operand form
822 					if (instrInfo.isResultInDst)
823 					{
824 						fixTwoOperandForm(instrInfo, instrIndex, instrHeader);
825 					}
826 					// optimization that can be safely disabled
827 					else
828 					{
829 						if (instrInfo.isMov)
830 						{
831 							IrIndex src = instrHeader.arg(lir, 0);
832 							if (resultReg == src)
833 							{
834 								// redundant mov
835 								//writefln("%s mov %s", resultReg, src);
836 								lir.removeInstruction(instrIndex);
837 							}
838 						}
839 					}
840 					break;
841 
842 				case phi:
843 					IrPhi* irPhi = lir.getPhi(vreg.definition);
844 
845 					// Skip phi functions with no users to avoid multiple writes to the same register.
846 					if (lir.getVirtReg(irPhi.result).users.length == 0) {
847 						break; // do not replace phi result without users, so we can detect and skip those in resolveEdge
848 					}
849 
850 					uint pos = live.linearIndices.basicBlock(irPhi.blockIndex);
851 					IrIndex reg = live.getRegFor(vregIndex, pos);
852 					irPhi.result = reg;
853 					break;
854 
855 				default: assert(false);
856 			}
857 		}
858 	}
859 
860 	void fixTwoOperandForm(InstrInfo instrInfo, IrIndex instrIndex, IrInstrHeader* instrHeader)
861 	{
862 		void makeMov(IrIndex to, IrIndex from)
863 		{
864 			IrArgSize sizeFrom;
865 			switch(from.kind) with(IrValueKind)
866 			{
867 				case stackSlot, global, func, constant, constantZero:
868 					sizeFrom = IrArgSize.size64; // use regular mov
869 					break;
870 				case physicalRegister:
871 					sizeFrom = cast(IrArgSize)from.physRegSize; // movzx for 8/16bit regs
872 					break;
873 				default:
874 					context.internal_error("fixTwoOperandForm %s", from);
875 			}
876 
877 			IrIndex instr;
878 			ExtraInstrArgs extra = {addUsers : false, result : to};
879 			// zero extend 8 and 16 bit args to 32bit
880 			final switch(sizeFrom) with(IrArgSize)
881 			{
882 				case size8:
883 					instr = builder.emitInstr!(Amd64Opcode.movzx_btod)(extra, from).instruction;
884 					break;
885 				case size16:
886 					instr = builder.emitInstr!(Amd64Opcode.movzx_wtod)(extra, from).instruction;
887 					break;
888 				case size32, size64:
889 					instr = builder.emitInstr!(Amd64Opcode.mov)(extra, from).instruction;
890 					break;
891 				case size128, size256, size512: context.internal_error("Invalid constant size %s", sizeFrom);
892 			}
893 			builder.insertBeforeInstr(instrIndex, instr);
894 		}
895 
896 		// Insert mov for instructions requiring two-operand form (like x86 xor)
897 		if (instrHeader.numArgs == 2)
898 		{
899 			// Rewrite
900 			// input: r1 = r2 op r3
901 			// as
902 			// output: r1 = r2
903 			// output: r1 = r1 op r3
904 			//
905 			// if r2 != r3
906 			// {
907 			//     if r1 == r2 { // input: r1 = r1 op r3
908 			//         output: r1 = r1 op r3
909 			//     } else if r1 == r3 { // input: "r1 = r2 op r1"
910 			//         if (op.isCommutative) { // input: "r1 = r3 op r2"
911 			//             output: "r1 = r1 op r2"
912 			//         } else { // input: "r1 = r2 op r1"
913 			//             // error, we cannot swap r2 with r1 here to get r2 = r1 op r2
914 			//             // because r2 will be overwritten. But r2 may be used later
915 			//             // this case is handled inside liveness analysis pass
916 			//         }
917 			//     } else { // input: "r1 = r2 op r3"
918 			//         output: "mov r1 r2"
919 			//         output: "r1 = op r1 r3"
920 			//     }
921 			// }
922 			// else // input: "r1 = op r2 r2"
923 			// {
924 			//     output: "mov r1 r2" if r1 != r2
925 			//     output: "r1 = op r1 r1"
926 			// }
927 
928 			IrIndex r1 = instrHeader.result(lir);
929 			IrIndex r2 = instrHeader.arg(lir, 0);
930 			IrIndex r3 = instrHeader.arg(lir, 1);
931 
932 			if ( !sameIndexOrPhysReg(r2, r3) ) // r2 != r3
933 			{
934 				if ( sameIndexOrPhysReg(r1, r2) ) // r1 == r2
935 				{
936 					// "r1 = op r1 r3", noop
937 				}
938 				else if ( sameIndexOrPhysReg(r1, r3) ) // r1 = op r2 r1
939 				{
940 					if (instrInfo.isCommutative) // r1 = op r1 r2
941 					{
942 						instrHeader.arg(lir, 0) = r1;
943 						instrHeader.arg(lir, 1) = r2;
944 					}
945 					else
946 					{
						context.internal_error("Unhandled non-commutative instruction in RA: %s %s %s %s in %s %s",
							cast(Amd64Opcode)instrHeader.op, r1, r2, r3,
							context.idString(lir.name), instrIndex);
951 					}
952 				}
953 				else // r1 = op r2 r3; all different
954 				{
955 					// mov r1 r2
956 					makeMov(r1, r2);
957 					// r1 = op r1 r3
958 					instrHeader.arg(lir, 0) = r1;
959 				}
960 			}
961 			else // r2 == r3
962 			{
963 				if ( !sameIndexOrPhysReg(r1, r2) )
964 				{
965 					// mov r1 r2
966 					makeMov(r1, r2);
967 				}
968 				// r1 = op r1 r1
969 				instrHeader.arg(lir, 0) = r1;
970 			}
971 
972 			// validation
973 			context.assertf(sameIndexOrPhysReg(instrHeader.result(lir), instrHeader.arg(lir, 0)),
974 				"two-operand form not ensured res(%s) != arg0(%s)", IrIndexDump(instrHeader.result(lir), context, lir), IrIndexDump(instrHeader.arg(lir, 0), context, lir));
975 		}
976 		else if (instrHeader.numArgs == 1)
977 		{
978 			// Rewrite
979 			// input: r1 = op r2
980 			// as
981 			// output: r1 = r2
982 			// output: r1 = op r1
983 			//
984 			IrIndex r1 = instrHeader.result(lir);
985 			IrIndex r2 = instrHeader.arg(lir, 0);
986 
987 			if ( !sameIndexOrPhysReg(r1, r2) )
988 			{
989 				makeMov(r1, r2);
990 			}
991 			instrHeader.arg(lir, 0) = r1;
992 		}
993 	}
994 
995 	/*
996 	// Resolve
997 	for each control flow edge from predecessor to successor do
998 		for each interval it live at begin of successor do
999 			if it starts at begin of successor then
1000 				phi = phi function defining it
1001 				opd = phi.inputOf(predecessor)
1002 				if opd is a constant then
1003 					moveFrom = opd
1004 				else
1005 					moveFrom = location of intervals[opd] at end of predecessor
1006 			else
1007 				moveFrom = location of it at end of predecessor
1008 			moveTo = location of it at begin of successor
1009 			if moveFrom ≠ moveTo then
1010 				mapping.add(moveFrom, moveTo)
1011 		mapping.orderAndInsertMoves()
1012 	*/
1013 	// Insert moves between sub-intervals on control flow edges where different sub-intervals meet
1014 	void resolveControlFlow(ref MoveSolver moveSolver)
1015 	{
1016 		version(RAPrint_resolve) writefln("resolve");
1017 
1018 		foreach (IrIndex succIndex, ref IrBasicBlock succBlock; lir.blocks)
1019 		{
1020 			foreach (IrIndex predIndex; succBlock.predecessors.range(lir))
1021 			{
1022 				IrBasicBlock* predBlock = lir.getBlock(predIndex);
1023 				resolveEdge(moveSolver, predIndex, *predBlock, succIndex, succBlock);
1024 			}
1025 		}
1026 	}
1027 
1028 	IrArgSize getMoveArgSize(IrIndex value)
1029 	{
1030 		if (value.isPhysReg) return cast(IrArgSize)value.physRegSize;
1031 		IrIndex type = getValueType(value, lir, context);
1032 		if (value.isStackSlot) type = context.types.getPointerBaseType(type);
1033 		return sizeToIrArgSize(context.types.typeSize(type), context);
1034 	}
1035 
1036 	void resolveEdge(
1037 		ref MoveSolver moveSolver,
1038 		IrIndex predIndex, ref IrBasicBlock predBlock,
1039 		IrIndex succIndex, ref IrBasicBlock succBlock)
1040 	{
		// these are already handled at the split site;
		// we have no linearIndices for new blocks
1043 		if (predBlock.replacesCriticalEdge || succBlock.replacesCriticalEdge) return;
1044 
1045 		uint succPos = live.linearIndices.basicBlock(succIndex);
1046 		uint predPos = live.linearIndices.instr(predBlock.lastInstr);
1047 
1048 		moveSolver.reset();
1049 
1050 		size_t[] succLiveIn = live.bitmap.blockLiveInBuckets(succIndex);
1051 
1053 		version(RAPrint_resolve) writefln("  edge %s -> %s", predIndex, succIndex);
1054 		foreach(size_t index; succLiveIn.bitsSet)
1055 		{
1056 			LiveInterval* interval = live.vint(index); // always returns first part of split intervals
1057 			if (!interval.isSplit) continue; // skip intervals that weren't split
1058 
1059 			IrIndex vreg = interval.definition;
1060 			version(RAPrint_resolve) writefln("    %s %s -> %s", vreg, succPos, predPos);
1061 			IrIndex succLoc = live.getRegFor(vreg, succPos);
1062 			IrIndex predLoc = live.getRegFor(vreg, predPos);
			if (!predLoc.isDefined) continue; // interval doesn't exist in pred
1064 
1065 			if (succLoc != predLoc)
1066 			{
1067 				IrArgSize argSize = getMoveArgSize(vreg);
1068 				version(RAPrint_resolve) writefln("    vreg %s, pred %s succ %s size %s", vreg, IrIndexDump(predLoc, context, lir), IrIndexDump(succLoc, context, lir), argSize);
1069 				moveSolver.addMove(predLoc, succLoc, argSize, FromValue.no);
1070 			}
1071 		}
1072 
1073 		foreach (IrIndex phiIndex, ref IrPhi phi; succBlock.phis(lir))
1074 		{
1075 			version(RAPrint_resolve) writef("    phi %s res %s", phiIndex, IrIndexDump(phi.result, context, lir));
1076 
1077 			// Skip phi functions with no users to avoid multiple writes to the same register.
1078 			// We do not replace results for userless phis in fixInstructionResults
1079 			if (phi.result.isVirtReg) {
1080 				auto numUsers = lir.getVirtReg(phi.result).users.length;
1081 				context.assertf(numUsers == 0, "phi is expected to have 0 users, but has %s users", numUsers);
1082 				continue;
1083 			}
1084 
1085 			IrIndex[] preds = lir.getBlock(phi.blockIndex).predecessors.data(lir);
1086 			foreach (size_t arg_i, ref IrIndex arg; phi.args(lir))
1087 			{
1088 				IrIndex argBlock = preds[arg_i];
1089 				if (argBlock == predIndex)
1090 				{
1091 					IrIndex moveFrom = arg;
1092 					IrIndex moveTo = phi.result;
1093 
1094 					IrArgSize argSize = getMoveArgSize(phi.result);
1095 					version(RAPrint_resolve) writefln(" arg %s %s size %s", argBlock, IrIndexDump(arg, context, lir), argSize);
1096 
1097 					FromValue fromValue = FromValue.no;
1098 					if (moveFrom.isStackSlot) {
1099 						if (lir.getStackSlot(moveFrom).kind != StackSlotKind.spillSlot) {
1100 							fromValue = FromValue.yes;
1101 						}
1102 					}
1103 					moveSolver.addMove(moveFrom, moveTo, argSize, fromValue);
1104 				}
1105 			}
1106 		}
1107 
1108 		if (moveSolver.numWrittenNodes == 0) return;
1109 
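		// Choose a placement such that the moves execute only on this edge:
		// a critical edge gets a fresh block; otherwise use whichever side
		// has this edge as its sole exit or entry.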
1110 		if (isCriticalEdge(predBlock, succBlock))
1111 		{
1112 			IrIndex movesTarget = splitCriticalEdge(predIndex, predBlock, succIndex, succBlock);
1113 			// add to the back
1114 			moveSolver.placeMovesBeforeInstr(builder, lir.getBlock(movesTarget).lastInstr, &getScratchSpillSlot);
1115 		}
1116 		else if (predBlock.successors.length > 1)
1117 		{
1118 			IrIndex movesTarget = succIndex;
1119 			// add to the front
1120 			moveSolver.placeMovesBeforeInstr(builder, lir.getBlock(movesTarget).firstInstr, &getScratchSpillSlot);
1121 		}
1122 		else
1123 		{
1124 			context.assertf(predBlock.successors.length == 1,
1125 				"predBlock.successors.length %s", predBlock.successors.length);
1126 			IrIndex movesTarget = predIndex;
1127 			// add to the back
1128 			moveSolver.placeMovesBeforeInstr(builder, lir.getBlock(movesTarget).lastInstr, &getScratchSpillSlot);
1129 		}
1130 	}
1131 
	/// A critical edge is an edge between a predecessor and a successor
	/// where the predecessor has more than 1 successor and
	/// the successor has more than 1 predecessor.
	/// Splitting such an edge is done by inserting a new basic block on the edge,
	/// which is then returned.
	/// The successor list of the predecessor and the predecessor list of the successor are updated.
	/// Phi functions are not updated.
1139 	IrIndex splitCriticalEdge(
1140 		IrIndex predIndex, ref IrBasicBlock predBlock,
1141 		IrIndex succIndex, ref IrBasicBlock succBlock)
1142 	{
		// place the block right after the predecessor to get better codegen
1144 		// TODO: may need to invert branch conditions for that
1145 		IrIndex newBlock = builder.appendBasicBlockSlot;
1146 		moveBlockAfter(lir, newBlock, predIndex);
1147 
1148 		version(RAPrint_resolve) writefln("Split critical edge %s -> %s with %s", predIndex, succIndex, newBlock);
1149 		predBlock.successors.replaceFirst(lir, succIndex, newBlock);
1150 		succBlock.predecessors.replaceFirst(lir, predIndex, newBlock);
1151 
1152 		lir.getBlock(newBlock).predecessors.append(builder, predIndex);
1153 		lir.getBlock(newBlock).successors.append(builder, succIndex);
1154 		lir.getBlock(newBlock).isSealed = true;
1155 		builder.emitInstr!(Amd64Opcode.jmp)(newBlock);
1156 		lir.getBlock(newBlock).isFinished = true;
1157 		lir.getBlock(newBlock).replacesCriticalEdge = true;
1158 		return newBlock;
1159 	}
1160 
1161 	// Insert moves between sub-intervals in the middle of basic blocks
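	// A split child starting at odd position p gets its move emitted before the
	// instruction at even position p + 1 (splitPos below, asserted to be even)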
1162 	void resolveInsertSplitMoves(ref MoveSolver moveSolver)
1163 	{
1164 		moveSolver.reset();
1165 		uint prevPos = 0;
1166 		version(RAPrint_resolve) writefln("Fix splits");
1167 
1168 		void insertPrevMoves() {
1169 			if (prevPos == 0) return;
1170 			if (moveSolver.numWrittenNodes == 0) return;
1171 
1172 			IrIndex insertBeforeInstr = live.evenIndexToIrIndex[prevPos/2];
1173 			context.assertf(insertBeforeInstr.isInstruction, "%s %s", prevPos, insertBeforeInstr);
1174 			version(RAPrint_resolve) writefln("   insert %s moves before %s at %s", moveSolver.numWrittenNodes, insertBeforeInstr, prevPos);
1175 			moveSolver.placeMovesBeforeInstr(builder, insertBeforeInstr, &getScratchSpillSlot);
1176 			moveSolver.reset();
1177 		}
1178 
1180 		foreach (IntervalIndex id; pendingMoveSplits)
1181 		{
			LiveInterval* it = &live.intervals[id];
1183 			uint splitPos = it.from+1;
1184 			assert((splitPos & 1) == 0, "Must be even pos");
			// flush all moves accumulated at the previous position
1186 			if (prevPos != splitPos) insertPrevMoves;
1187 
1188 			LiveInterval* parentIt = &live.intervals[it.parent];
1189 			version(RAPrint_resolve) writefln("  %s:%s split at %s, parent %s at %s", id, it.definition, it.from, it.parent, parentIt.to);
			context.assertf(prevPos <= splitPos, "Wrong order %s <= %s", prevPos, splitPos);
1191 			IrIndex moveFrom = parentIt.reg;
1192 			IrIndex moveTo = it.reg;
1193 			version(RAPrint_resolve) writefln("   from %s to %s", IrIndexDump(moveFrom, context, lir), IrIndexDump(moveTo, context, lir));
1194 			if (moveFrom != moveTo)
1195 			{
1196 				IrIndex vreg = it.definition;
1197 				IrArgSize argSize = getMoveArgSize(vreg);
1198 				moveSolver.addMove(moveFrom, moveTo, argSize, FromValue.no);
1199 				prevPos = splitPos;
1200 			}
1201 		}
1202 		insertPrevMoves;
1203 	}
1204 
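	/// Saves every callee-saved register that the scan allocated into a regSaveSlot
	/// in the entry block and restores it before the last instruction of the exit block.
	/// Sketch of the result (hypothetical register/slot names):
	///   entry: store s5, rbx
	///   ...
	///   exit:  rbx = load s5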
1205 	void genSaveCalleeSavedRegs()
1206 	{
1207 		IrIndex entryBlock = lir.entryBasicBlock;
1208 		IrIndex exitBlock = lir.exitBasicBlock;
1209 		CallConv* callConv = lir.getCallConv(context);
1210 		foreach(PhysReg reg; physRegs.calleeSavedRegs & physRegs.usedRegs)
1211 		{
1212 			// choose correct slot type
1213 			IrArgSize size = callConv.calleeSavedSizePerClass[reg.regClass];
1214 			IrIndex slotType;
1215 			switch(size) with(IrArgSize) {
1216 				case size64:
1217 					slotType = makeIrType(IrBasicType.i64);
1218 					break;
1219 				case size128:
1220 					slotType = context.v128Type;
1221 					break;
1222 				default: context.internal_error("Size not implemented %s", size);
1223 			}
1224 			IrIndex slot = builder.appendStackSlot(slotType, context.types.typeSizeAndAlignment(slotType), StackSlotKind.regSaveSlot);
1225 
1226 			// save register
1227 			ExtraInstrArgs extra1 = { argSize : size };
1228 			IrIndex instrStore = builder.emitInstr!(Amd64Opcode.store)(extra1, slot, IrIndex(reg, size));
1229 			builder.prependBlockInstr(entryBlock, instrStore);
1230 
1231 			// restore register
1232 			ExtraInstrArgs extra = { result : IrIndex(reg, size), argSize : size };
1233 			InstrWithResult instrLoad = builder.emitInstr!(Amd64Opcode.load)(extra, slot);
1234 			builder.insertBeforeLastInstr(exitBlock, instrLoad.instruction);
1235 		}
1236 	}
1237 
1238 	IrIndex getScratchSpillSlot() {
1239 		if (scratchSpillSlot.isUndefined) {
1240 			scratchSpillSlot = builder.appendStackSlot(makeIrType(IrBasicType.i64), context.types.typeSizeAndAlignment(makeIrType(IrBasicType.i64)), StackSlotKind.local);
1241 		}
1242 		return scratchSpillSlot;
1243 	}
1244 }
1245 
1246 struct IntervalPriority
1247 {
1248 	uint from;
1249 	IntervalIndex interval;
1250 }
1251 
1252 // Tweaked to prefer sequential inserts
1253 struct IntervalPriorityQueue
1254 {
1255 	Array!IntervalPriority queue;
1256 	uint maxFrom = 0;
1257 	uint numRemoved = 0;
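	// Invariants: queue[numRemoved .. $] is sorted by `from`;
	// queue[0 .. numRemoved] are slots freed by removeFront, reused by
	// out-of-order inserts (the shift-left path in insert below)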
1258 
1259 	void clear()
1260 	{
1261 		queue.clear;
1262 		maxFrom = 0;
1263 		numRemoved = 0;
1264 	}
1265 
1266 	void free(ref ArrayArena arrayArena)
1267 	{
1268 		queue.free(arrayArena);
1269 	}
1270 
1271 	void insert(ref ArrayArena arrayArena, IntervalIndex interval, uint from)
1272 	{
1273 		if (from >= maxFrom)
1274 		{
1275 			// fast path for append (happens most of the time)
1276 			queue.put(arrayArena, IntervalPriority(from, interval));
1277 			maxFrom = from;
1278 		}
1279 		else
1280 		{
1281 			for(size_t i = numRemoved; i < queue.length; ++i)
1282 			{
1283 				if (queue[i].from > from)
1284 				{
1285 					// insert before i
1286 					if (numRemoved > 0)
1287 					{
1288 						// shift to the left using one of removed slots
1289 						for(size_t j = numRemoved; j < i; ++j)
1290 							queue[j-1] = queue[j];
1291 						--numRemoved;
1292 						queue[i-1] = IntervalPriority(from, interval);
1293 					}
1294 					else
1295 					{
1296 						// shift to the right
1297 						queue.putAt(arrayArena, i, IntervalPriority(from, interval));
1298 					}
1299 					break;
1300 				}
1301 			}
1302 		}
1303 	}
1304 
1305 	IntervalIndex removeFront()
1306 	{
1307 		auto res = queue[numRemoved].interval;
1308 		++numRemoved;
1309 		return res;
1310 	}
1311 
1312 	bool empty() { return numRemoved == queue.length; }
1313 	uint length() { return queue.length - numRemoved; }
1314 }