Start GCC Porting
- add target prefix in config.sub
- add cpu_type ,tm_file and extra_part in gcc/config.gcc
- tm_file: target make file including header file and t-{target}
- t-{target} specify how to make target specify function and library
- extart_part: extra object file for the target compiler
- tm_file: target make file including header file and t-{target}
- Build cc1 only for testing
CUR_DIR=`pwd` $CUR_DIR/../../configure \ --target=nds32le-elf \ --prefix=$CUR_DIR/../../toolchain \ --disable-nls \ --disable-shared \ --enable-languages=c \ --enable-__cxa_atexit\ --enable-c99 \ --enable-long-long \ --enable-threads=single \ --with-newlib \ --disable-lto \ --disable-multilib \ --disable-libssp \ --disable-libgomp \ --disable-decimal-float \ --disable-libffi \ --disable-libmudflap \ --with-gmp=/home/cychen/source/host-tools \ --with-mpfr=/home/cychen/source/host-tools \ --with-mpc=/home/cychen/source/host-tools \ --with-ppl=/home/cychen/source/host-tools \ --with-cloog=/home/cychen/source/host-tools \ --with-host-libstdcxx='-lstdc++ -lm' make all-gcc LDFLAGS=-static CFLAGS='-g3 -O0' make install-gcc
Storage Layout relative macro
Define this macro to have the value 1 if the most significant bit in a byte has the lowest number #define BITS_BIG_ENDIAN
Define this macro to have the value 1 if the most significant byte in a word has the lowest number #define BYTES_BIG_ENDIAN
Define this macro to have the value 1 if, in a multiword object, the most significant word has the lowest number #define WORDS_BIG_ENDIAN
/* Width of a word, in units (bytes). */ #define UNITS_PER_WORD 4
/* Define this macro if it is advisable to hold scalars in registers in a wider mode than that declared by the program. In such cases, the value is constrained to be within the bounds of the declared type, but kept valid in the wider mode. The signedness of the extension may differ from that of the type. */ #define PROMOTE_MODE(MODE,UNSIGNEDP,TYPE) \ if (GET_MODE_CLASS (MODE) == MODE_INT \ && GET_MODE_SIZE (MODE) < UNITS_PER_WORD) \ { \ (MODE) = SImode; \ }
- promote scalar value in registers from narrower mode to SImode
Normal alignment required for function parameters on the stack, in bits #define PARM_BOUNDARY 32
Define this macro to the minimum alignment enforced by hardware for the stack pointer on this machine #define STACK_BOUNDARY 32/64
Alignment required for a function entry point, in bits. #define FUNCTION_BOUNDARY 32
Biggest alignment that any data type can require on this machine, in bits #define BIGGEST_ALIGNMENT 32/64
/* Alignment of field after `int : 0' in a structure. */ #define EMPTY_FIELD_BOUNDARY 32
/* A bit-field declared as `int' forces `int' alignment for the struct. */ #define PCC_BITFIELD_TYPE_MATTERS 1
/* Every structure's size must be a multiple of this. */ #define STRUCTURE_SIZE_BOUNDARY 8
Define this macro to be the value 1 if instructions will fail to work if given data not on the nominal alignment #define STRICT_ALIGNMENT
/* The best alignment to use in cases where we have a choice. */ #define FASTEST_ALIGNMENT 32
/* Make strings word-aligned so strcpy from constants will be faster. */ #define CONSTANT_ALIGNMENT(EXP, ALIGN) \ ((TREE_CODE (EXP) == STRING_CST \ && (ALIGN) < FASTEST_ALIGNMENT) \ ? FASTEST_ALIGNMENT : (ALIGN))
/* Make arrays of chars word-aligned for the same reasons. */ #define DATA_ALIGNMENT(TYPE, ALIGN) \ (TREE_CODE (TYPE) == ARRAY_TYPE \ && TYPE_MODE (TREE_TYPE (TYPE)) == QImode \ && (ALIGN) < FASTEST_ALIGNMENT ? FASTEST_ALIGNMENT : (ALIGN))
/* Layout of source language data types. */ #define SHORT_TYPE_SIZE 16 #define INT_TYPE_SIZE 32 #define LONG_TYPE_SIZE 32 #define LONG_LONG_TYPE_SIZE 64 #define FLOAT_TYPE_SIZE 32 #define DOUBLE_TYPE_SIZE 64 #define LONG_DOUBLE_TYPE_SIZE 64 /* Define this as 1 if `char' should by default be signed; else as 0. */ #define DEFAULT_SIGNED_CHAR 1 #define SIZE_TYPE "long unsigned int" #define PTRDIFF_TYPE "long int" #define WCHAR_TYPE "short unsigned int" #define WCHAR_TYPE_SIZE 16
Stack layout relative macro
/* Define this macro if pushing a word onto the stack moves the stack pointer to a smaller address. */ #define STACK_GROWS_DOWNWARD
/* Define this to nonzero if the nominal address of the stack frame is at the high-address end of the local variables; that is, each additional local variable allocated goes at a more negative offset in the frame. */ #define FRAME_GROWS_DOWNWARD
/* Offset within stack frame to start allocating local variables at. If FRAME_GROWS_DOWNWARD, this is the offset to the END of the first local allocated. Otherwise, it is the offset to the BEGINNING of the first local allocated. */ #define STARTING_FRAME_OFFSET
- Offset from the frame pointer to the first local variable slot to be allocated
/* Offset from the stack pointer register to the first location at which outgoing arguments are placed. */ #define STACK_POINTER_OFFSET
/* Offset from the argument pointer register to the first argument */ #define FIRST_PARM_OFFSET(FNDECL)
- STACK_DYNAMIC_OFFSET
- Offset from the stack pointer register to an item dynamically allocated on the stack, e.g., by alloca
- default after outgoing arguments
- DYNAMIC_CHAIN_ADDRESS
- A C expression whose value is RTL representing the address in a stack frame where the
pointer to the caller’s frame is stored.
/* return address rtx for __builtin_return_address */ #define RETURN_ADDR_RTX nds32_return_addr_rtx
- parameter count means return n-th caller's return address
- in GCC 4.6.2, arm and mips not support count > 0
- count == 0 return the current function return address, which is in register for most of targets.
- Stack layout
---------------- hard fp [ callee saved ] ---------------- pseudo fp [ local vars ] [ outgoing args] ---------------- sp
Basic Characteristics of Registers
/* hardware register number from 0 to a-1 */ #define FIRST_PSEUDO_REGISTER a
/* How to refer to registers in assembler output. This sequence is indexed by compiler's hard-register-number . */ #define REGISTER_NAMES \ {"$r0", "$r1", "$r2", "$r3", "$r4", "$r5", "$r6", "$r7", \ "$r8", "$r9", "$r10","$r11","$r12","$r13","$r14","$r15", \ "$r16","$r17","$r18","$r19","$r20","$r21","$r22","$r23", \ .. }
/* value 1 means not available for register allocation */ #define FIXED_REGISTERS \ { \ /* r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 */ \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ /* r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 */ \ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, \
/* has 1 for each register that is clobbered (in general) by function calls as well (caller save register) */ #define CALL_USED_REGISTERS \ { \ /* r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 */ \ 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, \ /* r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 */ \ 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, \
#define REG_ALLOC_ORDER \ { 5, 4, 3, 2, 1, 0, 6, 7, 8, 9, \ 10, 11, 16, 17, 18, 19, 24, 25, 20, 21, \ 22, 23, 12, 13, 14, 28, 30, 26, 27, 31, \ 29, 32, 33, 34, 35, 36, 37, 15, 38, 39, \ }
enum reg_class { NO_REGS, LO_REGS, HI_REGS, GENERAL_REGS, ... }
#define REG_CLASS_NAMES \ { \ "NO_REGS", \ "LO_REGS", \ "HI_REGS", \ "GENERAL_REGS", \ ... }
#define REG_CLASS_CONTENTS \ { \ { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS */ \ { 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* LO_REGS : 0-15 */ \ { 0xffff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* HI_REGS : 16-31 */ \ ... }
#define IRA_COVER_CLASSES \ { \ GENERAL_REGS,\ LIM_REG_CLASSES \ }
/* Tell IRA to use the order we define rather than messing it up with its own cost calculations. (without the macro the cost of callee save reg will increase while cross function call) */ #define HONOR_REG_ALLOC_ORDER
/* Return number of consecutive hard regs needed starting at reg REGNO to hold something of mode MODE. This is ordinarily the length in words of a value of mode MODE but can be less for certain modes in special long registers. */ #define HARD_REGNO_NREGS(REGNO, MODE) \ ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
/* return 1 if the regno is ok to content mode value */ #define HARD_REGNO_MODE_OK(regno, mode)
/* A C expression that is nonzero if a value of mode mode1 is accessible in mode mode2 without copying. */ /* Tie QI/HI/SI modes together. */ #define MODES_TIEABLE_P(MODE1, MODE2) \ (GET_MODE_CLASS (MODE1) == MODE_INT \ && GET_MODE_CLASS (MODE2) == MODE_INT \ && GET_MODE_SIZE (MODE1) <= UNITS_PER_WORD \ && GET_MODE_SIZE (MODE2) <= UNITS_PER_WORD)
/* Return class name for regno */ #define REGNO_REG_CLASS(regno) \ (regno <= 15 ? LO_REGS : HI_REGS)
/* Return the maximum number of consecutive registers needed to represent mode MODE in a register of class CLASS. */ #define CLASS_MAX_NREGS(CLASS, MODE) \ ((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
Debugging Support
- INCOMING_RETURN_ADDR_RTX
- the way find return address before prolog to support call frame debug
- If this RTL is a REG, you should also define DWARF_FRAME_RETURN_COLUMN to DWARF_FRAME_REGNUM (REGNO)
- EH_RETURN_DATA_REGNO (N)
- define the register to pass exception handle data
- You must define this macro if you want to support call frame exception handling like
that provided by DWARF 2.
- EH_RETURN_STACKADJ_RTX
- adjustment to be applied before function return. This is used to unwind the stack to
an exception handler’s call frame. It will be assigned zero on code paths that return normally.
- DWARF_FRAME_REGISTERS
- DWARF_FRAME_REGNUM
Instruction pattern in GCC
- There are three main conversions that happen in the compiler:
- The front end reads the source code and builds a parse tree.
- The parse tree is used to generate an RTL insn list based on named instruction patterns.
- The insn list is matched against the RTL templates to produce assembler code.
- GCC定義了一些ISA上常見的instruction RTL pattern稱為named pattern
- 在parsing tree轉換成RTL(register transfer language)的phase,GCC會用named RTL pattern相對應的parsing條件做比對,然後轉換成named RTL pattern
- named RTL pattern 可以用define_insn或define_expand定義
- 在GCC build的過程中會根據RTL pattern在build folder下generate出insn-emit.c
- 指令在都轉換成RTL後,會經過一連串optimization,最後由define_insn match RTL pattern,然後print out assembly code.
- 這時可以用nameless的define_insn pattern做assembly code matching
- nameless的define_insn,為了debug方便,取名為*開頭的pattern name
- 這使GCC知道它是nameless pattern,同時在debug過程中我們也可以知道它是match到哪個rtl pattern
- nameless的pattern不會用在rtl的生成,僅用在assembly code的matching
Here is an actual example of an instruction pattern, for the 68000/68020. (define_insn "tstsi" <= pattern name [(set (cc0) (match_operand:SI 0 "general_operand" "rm"))] "" <= match condition /\ /\ "* || || { predicate constraint if (TARGET_68020 || ! ADDRESS_REG_P (operands[0])) return \"tstl %0\"; <= output assembly code return \"cmpl #0,%0\"; <= %0 means print out operands[0] }")
- (match_operand:m n predicate constraint)
- m: operand mode
- n: operands number
- predicate: operand n matching的條件,唯有滿足這個條件才會match到這個rtl pattern
- constraint: 可以由constraint來設定ISA上這個operands的限制,如果在optimization的過程有產生不符合的RTL,最後會由reload來解決,同時也會影響到register allocation的結果
- predicate必須包含constraint,否則可能reload出不符predicate的pattern使compiler crash
- (match_scratch:m n constraint)
- could use it when the pattern need temporary register
- If the last few expressions in a parallel are clobber expressions whose
operands are either a hard register or match_scratch, the combiner can add or delete them when necessary
- (match_dup n)
- It is used when the operand needs to appear more than once in the insn
- should not use match_dup when the operands is both an input operand and the output operand
- (match_operator:m n predicate [operands...])
- match operator
- There is no way to specify constraints in match_operator. The operand of the insn which corresponds to the match_operator never has any constraints, because it is never reloaded as a whole
- it usually best if the operand number of the match_operator is higher than that of the actual operands of the insn. This improves register allocation because the register allocator often looks at operands 1 and 2 of insns to see if it can do register tying.
- Back trace of expand RTL in GCC
#0 expand_expr_real (exp=0xb7ff3900, target=0x0, tmode=VOIDmode, modifier=EXPAND_NORMAL, alt_rtl=0x0) at /ho me/cychen/source/462/work/build/../../gcc/expr.c:7221 #1 0x081b1a24 in expand_normal (exp=0xb7ff3900) at /home/cychen/source/462/work/build/../../gcc/expr.h:428 #2 0x081ba7b7 in expand_assignment (to=0xb7f61348, from=0xb7e27f60, nontemporal=0 '\000') at /home/cychen/so urce/462/work/build/../../gcc/expr.c:4228 #3 0x081167fc in expand_gimple_stmt_1 (stmt=0xb7e21630) at /home/cychen/source/462/work/build/../../gcc/cfge xpand.c:1975 #4 0x08116b2e in expand_gimple_stmt (stmt=0xb7e21630) at /home/cychen/source/462/work/build/../../gcc/cfgexp and.c:2084 #5 0x0811b1c3 in expand_gimple_basic_block (bb=0xb7f81fc0) at /home/cychen/source/462/work/build/../../gcc/c fgexpand.c:3626 #6 0x0811c0b9 in gimple_expand_cfg () at /home/cychen/source/462/work/build/../../gcc/cfgexpand.c:4109
- The "movm" pattern with constraint "r" must have "m" constraint, or else it have chance to reload fail.
- When generate RTL, operand match only consider predicate.
- When optimization, after code motion will call extract_insn(rtx insn) which in recog.c
- extract_insn will call recog_memoized first to recognize pattern by predicate.
- which eventually will call recog defined in insn-recog.c
- insn-recog.c is generated when building gcc according to *.md file
- which eventually will call recog defined in insn-recog.c
- and then will determine the RTL pattern belong to which alternative by the pattern constraint
- extract_insn will call recog_memoized first to recognize pattern by predicate.
Operand Constraints
- m: memory operands, the letter could re-defined by TARGET_MEM_CONSTRAINT
- o: offsettable memory operand
- V: not-offsettable memory operand
- <: auto-decrement(could be predecremnt or postdecremnt) addressing
- >: auto-increment(could be preincremnt or postincremnt) addressing
- r: register operand is allowed provided that it is in a general register
- i: immediate integer operand (one with constant value) is allowed. This includes symbolic constants
- n: immediate integer operand which less than a word wide
- E, F: immediate floating operand
- G, H: machine-dependent floating operands
- s: CONST_INT || CONST_DOUBLE || VOIDmode , not including symbol
- g: Any register, memory or immediate integer operand is allowed, except for registers that are not general registers
- X: It use when one of the alternative need scratch register but tho other don't
(define_insn "*andsi3_compare0_scratch" [(set (reg:CC_NOOV CC_REGNUM) (compare:CC_NOOV (and:SI (match_operand:SI 0 "s_register_operand" "r,r") (match_operand:SI 1 "arm_not_operand" "rI,K")) (const_int 0))) (clobber (match_scratch:SI 2 "=X,r"))] <== no need scratch in alter 0 "TARGET_32BIT" "@ tst%?\\t%0, %1 bic%.\\t%2, %0, #%B1" [(set_attr "conds" "set")] )
- 0->9: match with operand 0->9
- p: address operand allow by TARGET_LEGITIMATE_ADDRESS_P
Constraint Modifier Characters
- ?: slightly increase the cost of this alternative
- !: severely increase the cost of this alternative, still have change to use if the constraint meet without reloading
- =: output operand
- +: in/output operand
- &: earlyclobber operand, used in output operand when we want output operands not the same with any input operand
- %: Declares the instruction to be commutative, then GCC will try to interchange the two operands to fit the constraint
- should only use in the operator meet commutative law, e.g. add, mul, and.
(define_insn "addhi3" [(set (match_operand:HI 0 "general_operand" "=m,r") (plus:HI (match_operand:HI 1 "general_operand" "%0,0") (match_operand:HI 2 "general_operand" "di,g")))] ...)
- #: Only use the constraint for choosing register preferences
- *: Not use for choosing register preferences and no effect on reloading.
Defining Machine-Specific Predicates and Constraints
- Use define_predicate and define_special_predicate define machine specific predicate
- define_predicate will do mode test and define_special_predicate will not
- mode test: (mode == VOIDmode || GET_MODE (op) == mode);
(define_predicate "arm_float_compare_operand" (if_then_else (match_test "TARGET_VFP") (match_operand 0 "vfp_compare_operand") (match_operand 0 "arm_float_rhs_operand"))) ;; True for valid index operands. (define_predicate "index_operand" (ior (match_operand 0 "s_register_operand") (and (match_operand 0 "immediate_operand") (match_test "(GET_CODE (op) != CONST_INT || (INTVAL (op) < 4096 && INTVAL (op) > -4096))")))) ;; True for operators that can be combined with a shift in ARM state. (define_special_predicate "shiftable_operator" (and (match_code "plus,minus,ior,xor,and") (match_test "mode == GET_MODE (op)")))
- Use define_register_constraint, define_constraint, define_memory_constraint define machine specific constraint
(define_register_constraint "l" "TARGET_THUMB ? LO_REGS : GENERAL_REGS" "In Thumb state the core registers r0->r7.") (define_constraint "M" "In Thumb-1 state a constant that is a multiple of 4 in the range 0-1020." (and (match_code "const_int") (match_test "TARGET_32BIT ? ((ival >= 0 && ival <= 32) || (((ival & (ival - 1)) & 0xFFFFFFFF) == 0)) : ival >= 0 && ival <= 1020 && (ival & 3) == 0"))) (define_memory_constraint "Uv" "@internal In ARM/Thumb-2 state a valid VFP load/store address." (and (match_code "mem") (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)")))
- constraint character for what kind constraint have been specify in reload.c: find_reloads()
- generic constraint letters: 'E F V X g i m n o p r s'
- Should not use generic constraint letters to define machine specific constraint
- G, H: CONST_DOUBLE
- I, J, K, L, M, N, O, P: CONST_INT
- generic constraint letters: 'E F V X g i m n o p r s'
GCC 3.x register allocation
- regmove: generate move to satisfy constraint and delete the redundant instructions
- regclass: select preferred reg class and alternative reg class for register allocation
- local allocator: register allocation of live time with in a basic block
- global allocator: register allocation cross blocks (Chow's priority based colouring)
- reload: transfer rtl to satisfy operand constraint
- indirect code selection
- elimination of virtual hardware reg ex: argument pointer
- assign stack slot for spill code
- generate save/restore code for function call
- copy propagation
- simple rematerialization (如果重計算比較划算 就重算不做load/store)
- address inheritance (move2add) 減小 address reference 的offset
- post reload: remove redundent move at the end
GCC 4.x Integrated register allocation
- color priority的計算方式
priority = ( (live time上用到的hard reg數目) * (freq 這條life time會被執行到的傾向) * (log2 被指令使用到的次數) ) / (live time的長度(經過幾個rtl insn))
- IRA是一個regional allocator 也就是會將live time根據region切成幾個allocno 在對每個allocno做allocation
- region的切法 (可由 -fira-region= 設定):
- all (所有loop: IRA_REGION_ALL)
- mixed(所有loop除了register pressure很小的loop: IRA_REGION_MIXED is default value)
- one 一個function 當一個region (IRA_REGION_ONE)
- passes in IRA
- build IR: regions, allocnos, copies, costs
- coloring
- spill/restore points move
- emitting new registers and code fore register shuffling
- caller save optimizations
- rebuild IR
- reload
- back to coloring if need
- IRA可以定義cover class將register分類 然後根據architecture的ISA性質 給cost
- use IRA_COVER to define, and won't need after GCC 4.7
- 可以將allocno限制只分配給他cover class內的reg
- hard-register cost:用一個vector紀錄 他的大小和cover class內的hardware reg數目相同
會利用到 register_move_cost來計算這個vecter
- IRA 會用到的幾個structure
- allocno: 在一個region內的pseudo reg life time
- conflicting cllocnos
- conflicting hard reg
- 當allocate到caller save reg且cross function call時 cost增加 (增加spill的傾向)
- ira-costs.c: ira_tune_allocno_costs_and_cover_classes()
/* Change hard register costs for allocnos which lives through function calls. This is called only when we found all intersected calls during building allocno live ranges. */
if (! ira_hard_reg_not_in_set_p (regno, mode, call_used_reg_set) || HARD_REGNO_CALL_PART_CLOBBERED (regno, mode)) cost += (ALLOCNO_CALL_FREQ (a) * (ira_memory_move_cost[mode][rclass][0] + ira_memory_move_cost[mode][rclass][1])); #ifdef IRA_HARD_REGNO_ADD_COST_MULTIPLIER cost += ((ira_memory_move_cost[mode][rclass][0] + ira_memory_move_cost[mode][rclass][1]) * ALLOCNO_FREQ (a) * IRA_HARD_REGNO_ADD_COST_MULTIPLIER (regno) / 2); #endif
- allocno到callee save reg時,由於prolog/epilog可能需push/push所以增加cost
- ira-color.c: assign_hard_reg
- HONOR_REG_ALLOC_ORDER only exist in gcc 4.6 or after
#ifndef HONOR_REG_ALLOC_ORDER if (! allocated_hardreg_p[hard_regno] && ira_hard_reg_not_in_set_p (hard_regno, mode, call_used_reg_set)) /* We need to save/restore the hard register in epilogue/prologue. Therefore we increase the cost. */ { /* ??? If only part is call clobbered. */ rclass = REGNO_REG_CLASS (hard_regno); add_cost = (ira_memory_move_cost[mode][rclass][0] + ira_memory_move_cost[mode][rclass][1] - 1); cost += add_cost; full_cost += add_cost; } #endif
- conflict hard-register costs:和hard-register cost vector同大小
對於還沒分配到allocno 用這個vecter紀錄目前 get到某個hard reg所需的cost
- 當同一條pseudo reg的某個allocno挑到一個hard reg時 選那個hard reg的cost會降低
使得同一條pseudo reg的其他allocno傾向選同一個hard reg來減少move
- IRA會傾向將output operand和input operand挑同一個reg
- cap: nested region裏個allocno 針對cap的情況做了些判斷
- 一個allocno被稱為"trivially colourable" 只有當 conflict hard reg num + 儲存這個 allocno所需的hard reg數目 < available hard reg in same cover class 也就是說一定可以圖完色
- ira-color.c: put_allocno_into_bucket()
if (ALLOCNO_LEFT_CONFLICTS_NUM (allocno) + ira_reg_class_nregs[cover_class][ALLOCNO_MODE (allocno)] <= ALLOCNO_AVAILABLE_REGS_NUM (allocno)) add_allocno_to_bucket (allocno, &colorable_allocno_bucket);
- IRA分為兩個PASS
- pass1: 將trivially colourable allocno放進stack 然後剩下的再計算cost放進stack
- memory cost - reg cost +
- 0 for cap
- move cost for hard reg
- 減load/store cost for memory
- 是考慮到上層region的allocate情況對cost做改變
- pass1將最少用到的allocno先放進stack 在pass2時就會將 最常用到的allocno先pop出來做allocate (Chaitin-Briggs colouring的重要改變)
- memory cost - reg cost +
- pass2: allocate allocno to a hard reg 沒成功或cost太高則spill到 memory
- 利用下面的cost function決定要allocate哪個hard reg
- cost(allocno, hard reg) = conflictAllocnoCost(allocno, hard reg) + CopyAllocnoCost(allocno, hard reg)
- assign allocno到hard reg的cost = 所有和這個allocno conflict的ConflictCost(allocno, hard reg)
- 利用下面的cost function決定要allocate哪個hard reg
加上 所有和這個allocno相連的allocno的copy cost (用來表達 當assign不同hard reg時額外的move cost)
- 當assign完後和這個allocno相連的allocno的Copy cost會降低 (因為不需要copy嘛 如果選同個reg的話)
- IRA有個special pass來改變是mem 或 reg 目的是避免太內部的loop有太多的load/store
- IRA會assign新的pseudo-register給每個allocno 為的是減少reload發生spill時的影響範圍
因此造成真正的move的概率很低 就算發生也比整個pseudo-register都spill要來的好
- IRA有做caller save optimization來減少load/store
- 在IRA產生save/restore比在reload好
是因為都還是pseudo reg 要做任何簡化都比較容易
- IRA會試着使用同個stack slot來消除memory-memory move.
- reload會負責分配stack slot
會試着共用stack slot但只針對被reload spill出來的pseudo reg
- IRA對stack slot的分配
- 當有FP時 將常用的pseudo先分配到stack slot
- 當只有SP時 將少用的先分配到stack slog
- 消除momery to memory move
- 減小fp+offset和sp+offset的offset
- Register allocation relative article
GCC 4.x IRA select Best class for allocno
- Best class will select from important classes
- important class are the subclasses of cover class
- that is REG_CLASS we define in machine.h and also a subclass of cover class we define in IRA_COVER_CLASSES
- IRA_COVER_CLASSES is remove from GCC 4.7
- Build register class move_cost table for IRA
- use REGISTER_MOVE_COST porting function to build simple one last_move_cost table
- reginfo.c: init_move_cost (mode)
for (i = 0; i < N_REG_CLASSES; i++) if (contains_reg_of_mode[i][m]) for (j = 0; j < N_REG_CLASSES; j++) { int cost; if (!contains_reg_of_mode[j][m]) cost = 65535; else { cost = REGISTER_MOVE_COST (m, i, j); gcc_assert (cost < 65535); } all_match &= (last_move_cost[i][j] == cost); last_move_cost[i][j] = cost; }
- use last_move_cost table build (1) move_cost (2) may_move_in_cost (3) may_move_out_cost
- also in init_move_cost (mode)
cost = last_move_cost[i][j]; for (p2 = ®_class_subclasses[j][0]; *p2 != LIM_REG_CLASSES; p2++) if (*p2 != i && contains_reg_of_mode[*p2][m]) cost = MAX (cost, move_cost[m][i][*p2]); /* cost is the max cost of class i move to all possible subclass of j */ for (p1 = ®_class_subclasses[i][0]; *p1 != LIM_REG_CLASSES; p1++) if (*p1 != j && contains_reg_of_mode[*p1][m]) cost = MAX (cost, move_cost[m][*p1][j]); /* cost is the max cost of from all possible subclass of i move to all possible subclass of j */ gcc_assert (cost <= 65535); move_cost[m][i][j] = cost; if (reg_class_subset_p (i, j)) may_move_in_cost[m][i][j] = 0; /* when move from subclass may_move_in_cost get cost 0 else this encourage input operand select bigger class */ may_move_in_cost[m][i][j] = cost; if (reg_class_subset_p (j, i)) may_move_out_cost[m][i][j] = 0; /* when move to subclass may_move_out cost get cost 0 else this encourage output operand select smaller class */ may_move_out_cost[m][i][j] = cost;
- calculate reg_class move cost for each allocno
- ira-costs.c: find_allocno_class_costs()
/* Find costs of register classes and memory for allocnos and their best costs. */
- which will scan each insn's operand to calculate cost
- call graph would be
find_allocno_class_costs() -> process_bb_node_for_costs () -> scan_one_insn () -> record_operand_costs () -> record_reg_classes ()
- record_reg_classes ()
- process constraint to calculate each cost_class(important class) cost for each operand
case '*': /* the constraint after '*' not influence best class selection */ /* Ignore the next letter for this pass. */ c = *++p; break; case '?': /* increase cost for the constraint after '?' */ alt_cost += 2; case '!': case '#': case '&': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case 'p': allows_addr = 1; win = address_operand (op, GET_MODE (op)); /* We know this operand is an address, so we want it to be allocated to a register that can be the base of an address, i.e. BASE_REG_CLASS. */ classes[i] = ira_reg_class_union[classes[i]] [base_reg_class (VOIDmode, ADDRESS, SCRATCH)]; break; case 'm': case 'o': case 'V': /* It doesn't seem worth distinguishing between offsettable and non-offsettable addresses here. */ insn_allows_mem[i] = allows_mem[i] = 1; if (MEM_P (op)) win = 1; break;
- calculate cost for operand i
struct costs *pp = this_op_costs[i]; for (k = 0; k < cost_classes_num; k++) { rclass = cost_classes[k]; /* rclass is any possible important class */ pp->cost[k] = (((recog_data.operand_type[i] != OP_OUT ? ira_get_may_move_cost (mode, rclass, /* will use may_move_in_cost[] */ classes[i], true) : 0) /* classes[i] is the constraint we fill for operand i */ + (recog_data.operand_type[i] != OP_IN ? ira_get_may_move_cost (mode, classes[i], /* will use may_move_out_cost[] */ rclass, false) : 0)) * frequency); /* frequency present the execution times of basic block */ }
- Final adjust best cost for each operand
op_cost_add = alt_cost * frequency; /* Finally, update the costs with the information we've calculated about this alternative. */ for (i = 0; i < n_ops; i++) if (REG_P (ops[i]) && REGNO (ops[i]) >= FIRST_PSEUDO_REGISTER) { struct costs *pp = op_costs[i], *qq = this_op_costs[i]; int scale = 1 + (recog_data.operand_type[i] == OP_INOUT); pp->mem_cost = MIN (pp->mem_cost, (qq->mem_cost + op_cost_add) * scale); for (k = 0; k < cost_classes_num; k++) pp->cost[k] = MIN (pp->cost[k], (qq->cost[k] + op_cost_add) * scale); }
- definition of ALLOCNO_BAD_SPILL_P
- when op is a pseudo register and insn not allow memory
- allocate hard reg fail will generate spill code
for (i = 0; i < n_ops; i++) { ira_allocno_t a; rtx op = ops[i]; if (! REG_P (op) || REGNO (op) < FIRST_PSEUDO_REGISTER) continue; a = ira_curr_regno_allocno_map [REGNO (op)]; if (! ALLOCNO_BAD_SPILL_P (a) && insn_allows_mem[i] == 0) ALLOCNO_BAD_SPILL_P (a) = true; }
- scan_one_insn()
- accumulate reg_class_cost of each operand to COSTS_OF_ALLOCNO
record_operand_costs (insn, op_costs, allocno_pref); /* Now add the cost for each operand to the total costs for its allocno. */ for (i = 0; i < recog_data.n_operands; i++) if (REG_P (recog_data.operand[i]) && REGNO (recog_data.operand[i]) >= FIRST_PSEUDO_REGISTER) { int regno = REGNO (recog_data.operand[i]); struct costs *p = COSTS_OF_ALLOCNO (allocno_costs, ALLOCNO_NUM (ira_curr_regno_allocno_map[regno])); struct costs *q = op_costs[i]; p->mem_cost += q->mem_cost; for (k = 0; k < cost_classes_num; k++) p->cost[k] += q->cost[k]; }
GCC 3.x Tail call optimization
- calls.c: expand_call () will fill "CALL PLACEHOLDER"
- which will contain three versions of RTL instructions to represent the current function call
- normal call
- tail recursion
- sibling call
- which will contain three versions of RTL instructions to represent the current function call
- code generation
- The caller stores the callee’s arguments in the area for its own incoming arguments. (By definition, this is not part of the epilogue, but without this step the following instructions would not make sense.)
- The caller has to pop callee saved registers, should it be required
- If not given the option -fomit-frame-pointer, the caller restores the frame and base pointer, by reloading both with their prior values.
- Instead of finishing with a generic function call, the callee is invoked via the architecture’s low level jump command to reuse the existing stack frame along with the stack slots reserved for incoming arguments.
- limitations
- No pending clean ups
- Matching argument sizes of caller and callee
- No variable argument functions
- Sibling call epilogue must be defined
- No struct returns
- Caller and callee must have matching return types
- No setjmp and longjmp
- No volatile functions
- The caller’s stack frame must be empty
- No indirect calls
- No position independent code
- No overlapping arguments
- (Partly) Machine-dependent constraints
Sequence point
*p++ = *p + 10;
- cause *p have higher execution priority
- lvalue get *p address
- Howeve, rvalue (*p+10) is execute before *p++ or after ?
- in fact, "=" is not a sequence point
- therefore, the execution prority depend on the implementation of the compiler
- GCC will execute (*p + 10) first
- OPEN64 will execute (*p++) first
- therefore, the execution prority depend on the implementation of the compiler
- in fact, "=" is not a sequence point
Volatile reserve word in C make the statement to be a "sequence point"
Addressing modes
- have pre/post/inc/dec or not
HAVE_PRE_INCREMENT HAVE_PRE_DECREMENT HAVE_POST_INCREMENT HAVE_POST_DECREMENT
- have pre/post modify with a "constant" or not
HAVE_PRE_MODIFY_DISP HAVE_POST_MODIFY_DISP
- have pre/post modify with a "reg" or not
HAVE_PRE_MODIFY_REG HAVE_POST_MODIFY_REG
- CONSTANT_ADDRESS_P (x): a valid constant address or not
- CONSTANT_P (x): is symbol_ref, label_ref, high expressions, const arithmetic expressions, const_int , const_double expressions or not.
- MAX_REGS_PER_ADDRESS: max reg number in one valid memory address
- TARGET_LEGITIMATE_ADDRESS_P: a valid memory address or not
bool TARGET_LEGITIMATE_ADDRESS_P (enum machine mode mode, rtx x, bool strict)
- strict being true means already do register allocation
- use the array reg_renumber[pseudo regno]=hard regno or -1 for memory
- use #ifdef REG_OK_STRICT conditional in source file to define the strict variant in that case
- TARGET_LEGITIMIZE_ADDRESS: return a valid address when original address is invalid
rtx TARGET_LEGITIMIZE_ADDRESS (rtx x, rtx oldx, enum machine mode mode)
- The compiler has standard ways of doing so in all invalid cases.
- But often a "machine-dependent strategy" can generate better code
- LEGITIMIZE_ADDRESS will be called in explow.c: memory_address()
/* Return something equivalent to X but valid as a memory address for something of mode MODE. When X is not itself valid, this works by copying X or subexpressions of it into registers. */ rtx memory_address (enum machine_mode mode, rtx x) (gcc 4.4.4) memory_address_addr_space (enum machine_mode mode, rtx x, addr_space_t as) (gcc 4.6.2)
- memory_address will do
- convert address to Pmode (why?)
- force constant address to register for better cse chance
- call break_out_memory_refs ()
- force constant_address to reg and simplify the address calculation.
- call LEGITIMIZE_ADDRESS
- Symplify to (reg/symbol + const) format
- force to reg if MULT or MINUS
/* Return a copy of X in which all memory references and all constants that involve symbol refs have been replaced with new temporary registers. Also emit code to load the memory locations and constants into those registers. If X contains no such constants or memory references, X itself (not a copy) is returned. If a constant is found in the address that is not a legitimate constant in an insn, it is left alone in the hope that it might be valid in the address. X may contain no arithmetic except addition, subtraction and multiplication. Values returned by expand_expr with 1 for sum_ok fit this constraint. */ static rtx break_out_memory_refs (rtx x)
- what arm gcc do in LEGITIMIZE_ADDRESS
- transfer legitimize_tls_address to valid
- transfer base + big const => base + valid const
- transfer invalid big const => base + valid const
Naming pattern
- cstore<mode>4
- Store zero or nonzero in operand 0 according to whether a comparison is true
- Operand 0: comparison result is true or false
- Operand 1: comparison operator.
- Operand 2 and operand 3: first and second operand of the comparison.
- Store zero or nonzero in operand 0 according to whether a comparison is true
- cbranch<mode>4
- Conditional branch instruction combined with a compare instruction
- Operand 0:comparison operator
- Operand 1 and operand 2: first and second operands of the comparison
- Operand 3: label_ref that refers to the label to jump to
- Conditional branch instruction combined with a compare instruction
- call
- Subroutine call instruction returning no value.
- Operand 0: function to call
- Operand 1: number of bytes of arguments pushed as a const_int
- Operand 2: number of registers used as operands
- Subroutine call instruction returning no value.
- call_value
- Subroutine call instruction returning a value
- Operand 0: hard register in which the value is returned
- Other three operands just like "call"
- Subroutine call instruction returning a value
PC relative jumptable
/* Define this macro to be an expression with a nonzero value if jump tables (for tablejump insns) should be output in the text section, along with the assembler instructions. Otherwise, the readonly data section is used. */ #define JUMP_TABLES_IN_TEXT_SECTION 1
- Use "casesi" to implement dispatch table
- Instruction to jump through a dispatch table, including bounds checking
- Operand 0: The index to dispatch on, which has mode SImode
- Operand 1: The lower bound for the table, an integer constant
- Operand 2: The total range of the table (range = max index - min index)
- Operand 3: label of the jumptable
- Operand 4: label for jump out jumptable when condition not match
- Instruction to jump through a dispatch table, including bounds checking
- Condition of jumptable generation
- in stmt.c: expand_case()
- when the condition not satisfied which is the case index is too sparse
- GCC will generate binary search tree for switch statement
If (count >= case_values_threshold () && (range <= (optimize_insn_for_size_p () ? 3 : 10) * count)) { Generate jump_table } count: total switch number range: max_case_index – min_case_index
- Example
EX: count = 5 (there are 5 case) Range = 14 – 10 = 4 Switch(i) { case 10: c = a + b; break; case 11: c = a - b; break; case 12: c = a * b; break; case 13: c = a / b; break; case 14: c = a & b; break; default: }
- GCC default case_values_threshold () is 4 when "casesi" pattern is implement
- otherwise it would be 5
- could use TARGET_CASE_VALUES_THRESHOLD change it
- optimize_insn_for_size_p () will be true when -Os is set
- Therefore, when –Os condition would be
If (count >= 4 && (range <= 3 * count)) { Generate jump_table }
- Jumptable generation source code
else if (count < targetm.case_values_threshold () || compare_tree_int (range, (optimize_insn_for_size_p () ? 3 : 10) * count) > 0 /* RANGE may be signed, and really large ranges will show up as negative numbers. */ || compare_tree_int (range, 0) < 0 #ifndef ASM_OUTPUT_ADDR_DIFF_ELT || flag_pic #endif || !flag_jump_tables || TREE_CONSTANT (index_expr) /* If neither casesi or tablejump is available, we can only go this way. */ || (!HAVE_casesi && !HAVE_tablejump)) {
- ASM_OUTPUT_ADDR_DIFF_ELT (stream, body, value, rel)
- generate dispatch table entry as relative address (label difference .L-.L)
- when ASM_OUTPUT_ADDR_DIFF_ELT not defined, -fpic will not generate jumptable
- because GCC will use ASM_OUTPUT_ADDR_VEC_ELT generate dispatch table entry
- which will be absolute address (.L)
- because GCC will use ASM_OUTPUT_ADDR_VEC_ELT generate dispatch table entry
- Define CASE_VECTOR_PC_RELATIVE will use ASM_OUTPUT_ADDR_DIFF_ELT generate tabe entry
- which will generate PC relative jump table
- The advantage of PC relative jump table
- the table entry contain the difference beteen table label and case label
- when table label and max case label is close enough
- we could change the table entry mode as QImode to reduce the table size
- To do so need define CASE_VECTOR_SHORTEN_MODE(min, max, body)
- which operands are the min case label address and min case label address
- it calculate by the instruction attribute length defined in md file
- use this macro to determine the mode of table entry
- ASM_OUTPUT_ADDR_DIFF_ELT will get the mode and generate right format
- which operands are the min case label address and min case label address
- When the table entry may change mode, the alignment issue should be consider
- use ADDR_VEC_ALIGN to determine the jumptable alignment
- use ASM_OUTPUT_CASE_END to determine the alignment after jumptable
Passing Function Arguments
- ACCUMULATE_OUTGOING_ARGS
/* Define this if the maximum size of all the outgoing args is to be accumulated and pushed during the prologue. The amount can be found in the variable crtl->outgoing_args_size. */
- REG_PARM_STACK_SPACE
- define this macro will preserve fixed stack size for arguments, even if passed in registers.
- OUTGOING_REG_PARM_STACK_SPACE
- define this macro when preserve fixed stack size for arguments is allocated by caller
- TARGET_RETURN_POPS_ARGS (caller pop argument stack or callee pop)
- returns the number of bytes of its own arguments that a function pops on returning
- or return 0 if the function pops no arguments and the caller must therefore pop them all after the function returns
- TARGET_FUNCTION_ARG
- return reg rtx to indicate the argument passed by which register
- or return 0 indicate passed on the stack
- TARGET_FUNCTION_ARG_ADVANCE
- use to update the current argument register usage in structure CUMULATIVE_ARGS
- TARGET_FUNCTION_ARG_BOUNDARY
- returns the alignment boundary, in bits, of an argument with the specified mode and type
- default hook returns PARM_BOUNDARY for all arguments
- TARGET_ARG_PARTIAL_BYTES
- returns the number of bytes at the beginning of an argument that must be put in registers
- return 0 for arguments that are passed entirely in registers or on the stack
- FUNCTION_ARG_REGNO_P
- return nonzero if regno is the number of a hard register in which function arguments are sometimes passed
- TARGET_PASS_BY_REFERENCE
- return true if an argument at the position indicated by cum should be passed by reference
Frame Pointer Need
- when frame pointer is needed
- No omit frame pointer flag
- the function would call alloca and then modify stack frame
- enable stack checking flag
- accesses_prior_frames
- stack need realignment
- target defined fp required in TARGET_FRAME_POINTER_REQUIRED
- defined in ira.c:ira_setup_eliminable_regset()
/* FIXME: If EXIT_IGNORE_STACK is set, we will not save and restore sp for alloca. So we can't eliminate the frame pointer in that case. At some point, we should improve this by emitting the sp-adjusting insns for this case. */ int need_fp = (! flag_omit_frame_pointer || (cfun->calls_alloca && EXIT_IGNORE_STACK) /* We need the frame pointer to catch stack overflow exceptions if the stack pointer is moving. */ || (flag_stack_check && STACK_CHECK_MOVING_SP) || crtl->accesses_prior_frames || crtl->stack_realign_needed || targetm.frame_pointer_required ()); frame_pointer_needed = need_fp;
/* Nonzero if function being compiled called builtin_return_addr or builtin_frame_address with nonzero count. */ bool accesses_prior_frames;
- in reload pass will try to eliminate fp by the porting target hook
- TARGET_CAN_ELIMINATE
- INITIAL_ELIMINATION_OFFSET
- if eliminate fp fail, frame_pointer_needed will be 1
- define in reload1.c: update_eliminables()
Anchored addresses
- GCC anchored addresses optimization could transfer the following code
- On some targets, it would be better to calculate just one symbolic address and access the three variables relative to it
- could use "fsection-anchors" to enable if the target have been implement
static int a = 0, b = 1, c = 2; int foo (void) { return a + b + c; }
- as
int foo (void) { register int *xr = &x; return xr[&a - &x] + xr[&b - &x] + xr[&c - &x]; }
* NOTE: global variables in a object should be strong symbol, or else it would put into common section instead of anchor section.
- To enable anchored addresses at last need define following macro
- SET_ASM_OP: assemlber support for setting an address to symbol
- TARGET_MAX_ANCHOR_OFFSET: max offset relative to base that permit to generate
- TARGET_MIN_ANCHOR_OFFSET: min offset relative to base that permit to generate
- the base would be the anchored section start address.
- Other anchored addresses relative macro
- TARGET_ASM_OUTPUT_ANCHOR: how to setting the anchor section start address
- TARGET_USE_ANCHORS_FOR_SYMBOL_P: what kind symbol could move into anchor section
- mips use this macro to avoid move GP relative and PC relative symbol into anchor section
- Limitation
- Can't handle weak symbol, cause the data type(size) not determinate
- The effect scope in single compile unit (Not IPA)
Structure Passing as Argument
- TARGET_PASS_BY_REFERENCE: determine pass by reference or not
- could use this macro control pass by reg or by reference
MI_THUNK on GCC for C++
- A wrapper function to implement C++ virtual function calls
- TARGET_ASM_OUTPUT_MI_THUNK: update "this" pointer by add delta, and then could use "this" pointer to find the real function body for virtual function inheritance.
ARM ABI
old stack pointer -> | | ---- | | \ | | saved arguments for | | vararg functions | | / -- hard FP & arg pointer -> | | \ | | stack | | frame | | / -- | | \ | | call saved | | registers soft frame pointer -> | | / -- | | \ | | local | | variables locals base pointer -> | | / -- | | \ | | outgoing | | arguments current stack pointer -> | | / --
- ARM define function arm_get_frame_offsets to calculate stack offsets
- It would be call several times when needs frame info.
- arm define the structure arm_stack_offsets in machine_function structure
typedef struct GTY(()) arm_stack_offsets { int saved_args; /* ARG_POINTER_REGNUM. */ = (args.pretend_args_size for variadic functions) int frame; /* ARM_HARD_FRAME_POINTER_REGNUM. */ = (args.pretend_args_size for variadic functions) + (static_chain_stack_bytes) + (frame_pointer when need) int saved_regs; = (args.pretend_args_size for variadic functions) + (stack size for callee saved register) + ('''static_chain_stack_bytes'''); int soft_frame; /* FRAME_POINTER_REGNUM. */ = (saved_regs) + (CALLER_INTERWORKING_SLOT_SIZE ??) int locals_base; /* THUMB_HARD_FRAME_POINTER_REGNUM. */ = (soft_frame) + (local variables size) int outgoing_args; /* STACK_POINTER_REGNUM. */ = (locals_base) + (outgoing_args) unsigned int saved_regs_mask; }
ARM Change ARG_POINTER Alignment for DOUBLEWORD Alignment ABI
- ARM use the following code to change argument pointer alignment
/* Do anything needed before RTL is emitted for each function. */ void arm_init_expanders (void) { /* Arrange to initialize and mark the machine per-function status. */ init_machine_status = arm_init_machine_status; /* This is to stop the combine pass optimizing away the alignment adjustment of va_arg. */ if (cfun) mark_reg_pointer (arg_pointer_rtx, PARM_BOUNDARY); <= setting argument pointer alignment }
- Without these code, the GCC testsuite test case va-arg-26.c may get wrong
#include <stdarg.h> double f (float f1, float f2, float f3, float f4, float f5, float f6, ...) { va_list ap; double d; va_start (ap, f6); d = va_arg (ap, double); va_end (ap); return d; } int main () { if (f (1, 2, 3, 4, 5, 6, 7.0) != 7.0) abort (); exit (0); }
- va-arg-26.c.144r.expand
(insn 11 10 12 (set (reg:SI 134) (plus:SI (reg/f:SI 128 virtual-incoming-args) (const_int 4 [0x4]))) va-arg-26.c:9 -1 (nil)) (insn 14 13 15 (set (reg:SI 149) (plus:SI (reg/f:SI 134 [ D.2028 ]) (const_int 7 [0x7]))) va-arg-26.c:10 -1 (nil)) (insn 15 14 0 (set (reg:SI 137 [ D.2030 ]) (and:SI (reg:SI 149) (const_int -8 [0xfffffffffffffff8]))) va-arg-26.c:10 -1 (nil))
- For Double word alignment ABI
GCC will emit (a + 7) & -8 to fit double word alignment
- GCC could use the macro REGNO_POINTER_ALIGN
to mark the register value in certain alignment
- Registers alignment will initialize in emit-rtl.c
void init_emit (void) { ... #ifdef STACK_BOUNDARY REGNO_POINTER_ALIGN (STACK_POINTER_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (FRAME_POINTER_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (ARG_POINTER_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (VIRTUAL_INCOMING_ARGS_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (VIRTUAL_STACK_VARS_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (VIRTUAL_STACK_DYNAMIC_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (VIRTUAL_OUTGOING_ARGS_REGNUM) = STACK_BOUNDARY; REGNO_POINTER_ALIGN (VIRTUAL_CFA_REGNUM) = BITS_PER_WORD; #endif #ifdef INIT_EXPANDERS INIT_EXPANDERS; #endif }
- ARM will set STACK_BOUNDARY to 64 for DOUBLEWORD alignment ABI
Therefore, ARG_POINTER_REGNUM will set to 64 bit alignemnt.
- in combine phase, combine.c:8156
function force_to_mode ()
case PLUS: /* In (and (plus FOO C1) M), if M is a mask that just turns off low-order bits (as in an alignment operation) and FOO is already aligned to that boundary, mask C1 to that boundary as well. This may eliminate that PLUS and, later, the AND. */ { unsigned int width = GET_MODE_BITSIZE (mode); unsigned HOST_WIDE_INT smask = mask; /* If MODE is narrower than HOST_WIDE_INT and mask is a negative number, sign extend it. */ if (width < HOST_BITS_PER_WIDE_INT && (smask & ((unsigned HOST_WIDE_INT) 1 << (width - 1))) != 0) smask |= (unsigned HOST_WIDE_INT) (-1) << width; if (CONST_INT_P (XEXP (x, 1)) && exact_log2 (- smask) >= 0 && (nonzero_bits (XEXP (x, 0), mode) & ~smask) == 0 && (INTVAL (XEXP (x, 1)) & ~smask) != 0) return force_to_mode (plus_constant (XEXP (x, 0), (INTVAL (XEXP (x, 1)) & smask)), mode, smask, next_select); }
- These code try to eliminate the constant C1
(and (plus FOO C1) M)
- When FOO already fit the alignment in M,
then the lower bit in C1 could optimize out.
EX: (FOO + 11) & 0xfffffff8 could optimize to
(FOO + 8) & 0xfffffff8 when FOO already in mask alignemnt (which is 8-byte alignemnt)
- (nonzero_bits (XEXP (x, 0), mode) & ~smask) == 0 <== check Dose the FOO in mask alignment
- (INTVAL (XEXP (x, 1)) & ~smask) != 0) <== check whether the C1 lower bits not zero
- If ARG_POINTER_REGNUM is 64 bit alignemnt,
(ARG_POINTER_REGNUM + 11) & (-8) will optimize to (ARG_POINTER_REGNUM + 8) which not ARM really want.
- Therefore, ARM use mark_reg_pointer (arg_pointer_rtx, PARM_BOUNDARY)
Set arg_pointer to 32 bit alignemnt.
ARM GCC constant generation
- constant_ok_for_arm: 可用一道指令設定出來的constant值
- 小數值 (8 bit可表示)
- 小數值 + SHIFT
- 小數值 + ROTATE
- 小數值 + OR (partial repeat)
- const_ok_for_op:
- 可minus或可inverted的operation, 轉換後的constant是否符合 const_ok_for_arm
- arm_split_constant
- 由arm_gen_constant計算生成某constant的cost
cost太高時split成move pair, 否則用arm_gen_constant 生成constant
- arm_gen_constant:做constant拆解的分析和generate
- quick check
- IOR:
- a = b or 0xffffffff ==> a = 0xffffffff
- a = b or 0 ==> a = b
- AND:
- a = b and 0 ==> a = 0
- a = b and 0xffffffff ==> a = b
- XOR:
- a = b xor 0 ==> a = b
- a = b xor 0xffffffff ==> a = not (b)
- MINUS
- a = 0 - b ==> a = neg(b)
- 符合const_ok_for_arm的常數,emit出相對應的指令
- IOR:
- more transformation
- SET:
- set low_part
- move then shift
- move then add/sub
- move then or
- IOR/XOR:
- mvn support transformation
- AND: shift transformation
- SET:
- quick check
Trampolines for Nested Functions
- gcc testsuite testcase 20000822-1.c
#ifndef NO_TRAMPOLINES int f0(int (*fn)(int *), int *p) { return (*fn) (p); } int f1(void) { int i = 0; int f2(int *p) { i = 1; return *p + 1; } return f0(f2, &i); } #endif int main() { #ifndef NO_TRAMPOLINES if (f1() != 2) abort (); #endif return 0; }
- arm assembly code
.file "20000822-1.c" .text .align 2 .type f2.1205, %function f2.1205: @ Nested: function declared inside another function. @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 @ link register save eliminated. mov r3, #1 str r3, [ip, #0] <=== load static chain ldr r0, [r0, #0] <=== access variable of function f1 by static chain add r0, r0, r3 bx lr .size f2.1205, .-f2.1205 .align 2 .global f0 .type f0, %function f0: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 mov r3, r0 str lr, [sp, #-4]! mov r0, r1 mov lr, pc bx r3 ldr pc, [sp], #4 .size f0, .-f0 .section .rodata .align 2 .LTRAMP0: ldr ip, [pc, #0] <== load static chain value to ip ldr pc, [pc, #0] <== jump to function f2.1205 .word 0 .word 0 .global __clear_cache .text .align 2 .global f1 .type f1, %function f1: @ args = 0, pretend = 0, frame = 20 @ frame_needed = 0, uses_anonymous_args = 0 stmfd sp!, {r4, lr} ldr r3, #.LTRAMP0 sub sp, sp, #20 ldmia r3, {r0, r1, r2, r3} <== load trampolines code from .LTRAMP0 add r4, sp, #4 stmia r4, {r0, r1, r2, r3} <== store trampolines code to stack add r3, sp, #0 str r3, [r4, #8] <== store static chain value to stack ldr r3, #f2.1205 mov r0, r4 add r1, sp, #20 str r3, [r4, #12] <== store function address to stack bl __clear_cache <== sync icache content with memory mov r3, #0 str r3, [sp, #0] mov r0, sp mov lr, pc bx r4 add sp, sp, #20 ldmfd sp!, {r4, pc} .size f1, .-f1 .section .text.startup,"ax",%progbits .align 2 .global main .type main, %function main: @ args = 0, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 str lr, [sp, #-4]! bl f1 cmp r0, #2 blne abort .L5: mov r0, #0 ldr pc, [sp], #4 .size main, .-main .ident "GCC: (GNU) 4.6.2"
- GCC porting macro for trampolines
/* To generate trampoline code in .LTRAMP0 */ #define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template
/* To emit trampoline initial code, including copy trampoline code to stack and copy static chain and function address to stack */ #define TARGET_TRAMPOLINE_INIT arm_trampoline_init /* Emit RTL insns to initialize the variable parts of a trampoline. */ static void arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) { rtx fnaddr, mem, a_tramp; emit_block_move (m_tramp, assemble_trampoline_template (), <== copy trampoline code to stack GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL); mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12); emit_move_insn (mem, chain_value); <== store static chain value mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16); fnaddr = XEXP (DECL_RTL (fndecl), 0); emit_move_insn (mem, fnaddr); <== store function address a_tramp = XEXP (m_tramp, 0); emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"), <== emit sync icache call LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode, plus_constant (a_tramp, TRAMPOLINE_SIZE), Pmode);
- arm define TARGET_TRAMPOLINE_ADJUST_ADDRESS to set mode to thumb mode for TARGET_THUMB
#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address /* Thumb trampolines should be entered in thumb mode, so set the bottom bit of the address. */ static rtx arm_trampoline_adjust_address (rtx addr) { if (TARGET_THUMB) addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx, NULL, 0, OPTAB_LIB_WIDEN); return addr; }
/* Length in units of the trampoline for entering a nested function. */ #define TRAMPOLINE_SIZE (TARGET_32BIT ? 16 : 20)
/* Alignment required for a trampoline in bits. */ #define TRAMPOLINE_ALIGNMENT 32 /* the trampoline code alignment in data section and in run time stack */
Study of C++ ABI for Itanium: Exception handling
- Personality routine
- 找目前frame的handler, 並判斷current frame找到的handler能不能處理現在catch到的exception
- 用來處理 frame跳轉 相關的設定
- 例如exception object pointer 要如何傳遞, 設定landing pad.
- 是language dependent, 因為不同的語言可能會設計不同的處理方式
- g++定義在 libstdc++-v3/libsupc++/eh_personality.cc
- 在unwind的過程中,會先decode FDE來找到目前frame的 personality routine,存在 再跳進去
- Unwinding的原因有兩種
- exception 例如c++ support的
- 由personality routine判斷有沒有handler可以處理
- _Unwind_RaiseException perform exception style unwinding
- force unwinding 例如 longjmp 或 thread termination
- 不需要personality routine去判斷,只想做直接的跳轉 所以用_UA_FORCE_UNWIND來標記這件事
- _Unwind_ForcedUnwind perform force unwinding
- exception 例如c++ support的
- __attribute__(cleanup) is a GCC extension which will do unwinding
- Therefore, There're also a unwinding library in C source code.
- the attribute will call cleanup function when the variable out of the scope
- http://stackoverflow.com/questions/1828550/portable-equivalent-to-gccs-attribute-cleanup
The Unwind Process
- Search Phase
- unwinding然後call personality routine with _UA_SEARCH_PHASE flag
- unwinding是不斷的利用return address找出current frame的personaltiy routine,再判斷有沒有可以處理的handler存在, 在Search phase並不會真的改變unwind state
- 當Search phase找到可以處理的handler時才會進入第二的phase,沒找會call terminate().
- Cleanup Phase
- unwind然後call personality routine with _UA_CLEARNUP_PHASE flag
- 這次會真的改變register state然後跳轉到 user landing pad code (user handler code)
- Resumptive exception handling and Non-resumptive
- Resumptive model: exception handle執行完之後,程式會從發生exception的點開始執行
- Non-resumptive model: exception handler執行完之後,程式會由catch block的下個statement開始執行
- C++是屬於Non-resumptive
- Two phase unwinding對於Non-resumptive其實不一定需要,但GCC為了兼容不同語言,可能是Resumptive or Non-resumptive而做了two phase的設計
Exception header
- Exception structure in GCC, define in unwind-generic.h
struct _Unwind_Exception 86 { 87 _Unwind_Exception_Class exception_class; 88 _Unwind_Exception_Cleanup_Fn exception_cleanup; 89 _Unwind_Word private_1; 90 _Unwind_Word private_2; 91 92 /* @@@ The IA-64 ABI says that this structure must be double-word aligned. 93 Taking that literally does not make much sense generically. Instead we 94 provide the maximum alignment required by any type for the machine. */ 95 } __attribute__((__aligned__));
- exception_class 和語言相關也和實作相關, 可以用來分辨是同個語言的exception或是不同語言的(native and foreign exceptions)
- exception_cleanup 刪除exception object
- exception_cleanup 的其中一個參數是 _Unwind_Reason_Code reason, 用來描述為何要刪除這個exception object
- _URC_FOREIGN_EXCEPTION_CAUGHT = 1 一些foreign相關導致產生 未定義行為的情況
- _URC_FATAL_PHASE1_ERROR = 3 personality routine在phase 1發生錯誤
- _URC_FATAL_PHASE2_ERROR = 2 personality routine在phase 2發生錯誤
- exception_cleanup 的其中一個參數是 _Unwind_Reason_Code reason, 用來描述為何要刪除這個exception object
Unwind Context
- _Unwind_Context structure用來記錄frame的狀態
- _Unwind_Context for setjmp longjmp unwinding 定義在unwind-sjlj.c
- _Unwind_Context for dwarf2 定義在 unwind-dw2.c
58 /* This is the register and unwind state for a particular frame. This 59 provides the information necessary to unwind up past a frame and return 60 to its caller. */ 61 struct _Unwind_Context 62 { 63 void *reg[DWARF_FRAME_REGISTERS+1]; 64 void *cfa; 65 void *ra; 66 void *lsda; 67 struct dwarf_eh_bases bases; 68 /* Signal frame context. */ 69 #define SIGNAL_FRAME_BIT ((~(_Unwind_Word) 0 >> 1) + 1) 70 /* Context which has version/args_size/by_value fields. */ 71 #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1) 72 _Unwind_Word flags; 73 /* 0 for now, can be increased when further fields are added to 74 struct _Unwind_Context. */ 75 _Unwind_Word version; 76 _Unwind_Word args_size; 77 char by_value[DWARF_FRAME_REGISTERS+1]; 78 };
Throwing an Exception
- _Unwind_RaiseException define in unwind.inc
79 /* Raise an exception, passing along the given exception object. */ 80 81 _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE 82 _Unwind_RaiseException(struct _Unwind_Exception *exc) 83 { 84 struct _Unwind_Context this_context, cur_context; 85 _Unwind_Reason_Code code; 86 87 /* Set up this_context to describe the current stack frame. */ 88 uw_init_context (&this_context); 89 cur_context = this_context; 90 91 /* Phase 1: Search. Unwind the stack, calling the personality routine 92 with the _UA_SEARCH_PHASE flag set. Do not modify the stack yet. */ 93 while (1) 94 { 95 _Unwind_FrameState fs; 96 97 /* Set up fs to describe the FDE for the caller of cur_context. The 98 first time through the loop, that means __cxa_throw. */ 99 code = uw_frame_state_for (&cur_context, &fs); 100 101 if (code == _URC_END_OF_STACK) 102 /* Hit end of stack with no handler found. */ 103 return _URC_END_OF_STACK; 104 105 if (code != _URC_NO_REASON) 106 /* Some error encountered. Usually the unwinder doesn't 107 diagnose these and merely crashes. */ 108 return _URC_FATAL_PHASE1_ERROR; 109 110 /* Unwind successful. Run the personality routine, if any. */ 111 if (fs.personality) 112 { 113 code = (*fs.personality) (1, _UA_SEARCH_PHASE, exc->exception_class, 114 exc, &cur_context); 115 if (code == _URC_HANDLER_FOUND) 116 break; 117 else if (code != _URC_CONTINUE_UNWIND) 118 return _URC_FATAL_PHASE1_ERROR; 119 } 120 121 /* Update cur_context to describe the same frame as fs. */ 122 uw_update_context (&cur_context, &fs); 123 } 124 125 /* Indicate to _Unwind_Resume and associated subroutines that this 126 is not a forced unwind. Further, note where we found a handler. */ 127 exc->private_1 = 0; 128 exc->private_2 = uw_identify_context (&cur_context); 129 130 cur_context = this_context; 131 code = _Unwind_RaiseException_Phase2 (exc, &cur_context); 132 if (code != _URC_INSTALL_CONTEXT) 133 return code; 134 135 uw_install_context (&this_context, &cur_context); 136 }
- _Unwind_RaiseException 只有在發生錯誤時 會return reason code
- _Unwind_Reason_Code 可能的值
- _URC_END_OF_STACK: unwinding到最後一個stack,而且找不到可以處理的handler
- 在這樣的case下 The C++ runtime最終會call uncaught_exception()來處理
- URC_FATAL_PHASE1_ERROR: 在phase1發生非預期的錯誤 例如 stack corrupt
- 在這樣的情況下最終會call terminate()
- _URC_END_OF_STACK: unwinding到最後一個stack,而且找不到可以處理的handler
- _Unwind_Reason_Code 可能的值
- unwinding發生後 很有可能會改變stack或register的狀態,所以"caller of _Unwind_RaiseException"不能對caller目前的state有所假設
- _Unwind_ForcedUnwind define in unwind.inc
192 /* Raise an exception for forced unwinding. */ 193 194 _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE 195 _Unwind_ForcedUnwind (struct _Unwind_Exception *exc, 196 _Unwind_Stop_Fn stop, void * stop_argument) 197 { 198 struct _Unwind_Context this_context, cur_context; 199 _Unwind_Reason_Code code; 200 201 uw_init_context (&this_context); 202 cur_context = this_context; 203 204 exc->private_1 = (_Unwind_Ptr) stop; 205 exc->private_2 = (_Unwind_Ptr) stop_argument; 206 207 code = _Unwind_ForcedUnwind_Phase2 (exc, &cur_context); 208 if (code != _URC_INSTALL_CONTEXT) 209 return code; 210 211 uw_install_context (&this_context, &cur_context); 212 }
- Forced unwinding is a single-phase process
- 真正的unwinding發生在 _Unwind_ForcedUnwind_Phase2
- Unwinding結束,是由stop和stop_argument決定,而不是由 personality routine
- 當達到停止條件之後 會call _Unwind_DeleteException 然後跳進user code, 否則會return reason code
- _Unwind_ForcedUnwind 可能會return的 _Unwind_Reason_Code
- _URC_NO_REASON: 不是想要到達的frame會在call這個frame的personality routine with action flag _UA_FORCE_UNWIND和_UA_CLEANUP_PHASE 然後unwind到下個frame在call一次stop function.
- _URC_END_OF_STACK: 當unwind到最後一個frame都沒法滿足 stop function時return這個code, 這時context內的sp會設成NULL
- _URC_FATAL_PHASE2_ERROR: stop function可能會return這個code當發生stack corruption
- Example:用Force unwinding 實作longjmp_unwind()
- Setjmp()儲存現在的state包含frame pointer的值
- longjmp_unwind() call _Unwind_ForcedUnwind, stop function設定為unwind到frame pointer的值和Setjmp存的frame pointer 值相同時停止
- 停止後把state還原為Setjmp()儲存的state, 否則就return _URC_NO_REASON 繼續unwind
- _Unwind_Resume define in unwind.inc
215 /* Resume propagation of an existing exception. This is used after 216 e.g. executing cleanup code, and not to implement rethrowing. */ 217 218 void LIBGCC2_UNWIND_ATTRIBUTE 219 _Unwind_Resume (struct _Unwind_Exception *exc) 220 { 221 struct _Unwind_Context this_context, cur_context; 222 _Unwind_Reason_Code code; 223 224 uw_init_context (&this_context); 225 cur_context = this_context; 226 227 /* Choose between continuing to process _Unwind_RaiseException 228 or _Unwind_ForcedUnwind. */ 229 if (exc->private_1 == 0) 230 code = _Unwind_RaiseException_Phase2 (exc, &cur_context); 231 else 232 code = _Unwind_ForcedUnwind_Phase2 (exc, &cur_context); 233 234 gcc_assert (code == _URC_INSTALL_CONTEXT); 235 236 uw_install_context (&this_context, &cur_context); 237 }
- _Unwind_Resume是唯一一個user code可以直接用的function, 可以寫在user landing pad code的最後 繼續做unwinding
- 這和rethrow不同, rethrow的實作必須在call一次 _Unwind_RaiseException
Context Management
- 在GCC裏有dwarf2和setjmp/longjmp的version
- 分別定義在unwind-dw2.c和unwind-sjlj.c
- _Unwind_GetGR:取得context內某個gpr register的value
- _Unwind_SetGR:設定context內某個gpr register的value
- _Unwind_GetIP:取得context的return value
- _Unwind_SetIP:設定context的return value
- _Unwind_GetLanguageSpecificData: returns the address of the "language-specific data area"(LSDA) for the current stack frame.
- _Unwind_GetRegionStart: This routine returns the address of the beginning of the procedure or code fragment described by the current unwind descriptor block.
Personality Routine
- The personality routine is the interface between "system unwind library" and "language-specific exception handling semantic"
_Unwind_Reason_Code (*__personality_routine) (int version, _Unwind_Action actions, uint64 exceptionClass, struct _Unwind_Exception *exceptionObject, struct _Unwind_Context *context);
- personality routine的參數
- version: 用來偵測unwinder和personality routine有沒有match
- action: 描述personality routine的狀態 是一個bit mask
- exceptionClass: 8byte的大小用來描述execption type
- high 4byte 描述vendor 例如 HP
- low 4 byte 描述語言 例如 C++
- context: current frame context (Unwind_Context)
- Personality routine actions
- _UA_SEARCH_PHASE: 搜尋目前的frame有沒有可以處理目前exception的handler
- 找到handler回傳 URC_HANDLER_FOUND 沒找到回傳 _URC_CONTINUE_UNWIND
- _UA_CLEANUP_PHASE: 目前的frame要做cleanup的動作,會利用SET_IP設定ra到landing pad然後return code _URC_INSTALL_CONTEXT
- _UA_HANDLER_FRAME: 這用在phase2, 用來標記現在的frame就是在phase1找到handler的那個frame.
- _UA_FORCE_UNWIND: 摽記目前是做force unwind, 不需要personalty routine判斷可不可以處理這個excetpion.
- _UA_SEARCH_PHASE: 搜尋目前的frame有沒有可以處理目前exception的handler
C++ Exception object create and destroy
- Mapping to Level II:C++ ABI in http://refspecs.linuxfoundation.org/abi-eh-1.22.html
C++ Exception objects
- C++ exception object structure defined in gcc_folder/libstdc++-v3/libsupc++/unwind-cxx.h
45 // A primary C++ exception object consists of a header, which is a wrapper 46 // around an unwind object header with additional C++ specific information, 47 // followed by the exception object itself. 48 49 struct __cxa_exception 50 { 51 // Manage the exception object itself. 52 std::type_info *exceptionType; 53 void (*exceptionDestructor)(void *); 54 55 // The C++ standard has entertaining rules wrt calling set_terminate 56 // and set_unexpected in the middle of the exception cleanup process. 57 std::unexpected_handler unexpectedHandler; 58 std::terminate_handler terminateHandler; 59 60 // The caught exception stack threads through here. 61 __cxa_exception *nextException; 62 63 // How many nested handlers have caught this exception. A negated 64 // value is a signal that this object has been rethrown. 65 int handlerCount; 66 67 #ifdef __ARM_EABI_UNWINDER__ 68 // Stack of exceptions in cleanups. 69 __cxa_exception* nextPropagatingException; 70 71 // The nuber of active cleanup handlers for this exception. 72 int propagationCount; 73 #else 74 // Cache parsed handler data from the personality routine Phase 1 75 // for Phase 2 and __cxa_call_unexpected. 76 int handlerSwitchValue; 77 const unsigned char *actionRecord; 78 const unsigned char *languageSpecificData; 79 _Unwind_Ptr catchTemp; 80 void *adjustedPtr; 81 #endif 82 83 // The generic exception header. Must be last. 84 _Unwind_Exception unwindHeader; 85 };
- Caught Exception Stack
- define in gcc_folder/libstdc++-v3/libsupc++/unwind-cxx.h
136 // Each thread in a C++ program has access to a __cxa_eh_globals object. 137 struct __cxa_eh_globals 138 { 139 __cxa_exception *caughtExceptions; 140 unsigned int uncaughtExceptions; 141 #ifdef __ARM_EABI_UNWINDER__ 142 __cxa_exception* propagatingExceptions; 143 #endif 144 };
- caughtExceptions:
- a global exception link list for each thread
- the most recent first
- linked through nextException of exception header
Throwing an Exception
- call __cxa_allocate_exception to create exception object and then call __cxa_throw to throw it
- __cxa_allocate_exception define in gcc_folder/libstdc++-v3/libsupc++/eh_alloc.cc
96 extern "C" void * 97 __cxxabiv1::__cxa_allocate_exception(std::size_t thrown_size) throw() 98 { 99 void *ret; 100 101 thrown_size += sizeof (__cxa_refcounted_exception); 102 ret = malloc (thrown_size); 103 104 if (! ret) 105 { 106 __gnu_cxx::__scoped_lock sentry(emergency_mutex); 107 108 bitmask_type used = emergency_used; 109 unsigned int which = 0; 110 111 if (thrown_size > EMERGENCY_OBJ_SIZE) 112 goto failed; 113 while (used & 1) 114 { 115 used >>= 1; 116 if (++which >= EMERGENCY_OBJ_COUNT) 117 goto failed; 118 } 119 120 emergency_used |= (bitmask_type)1 << which; 121 ret = &emergency_buffer[which][0]; 122 123 failed:; 124 125 if (!ret) 126 std::terminate (); 127 } 128 129 // We have an uncaught exception as soon as we allocate memory. This 130 // yields uncaught_exception() true during the copy-constructor that 131 // initializes the exception object. See Issue 475. 132 __cxa_eh_globals *globals = __cxa_get_globals (); 133 globals->uncaughtExceptions += 1; 134 135 memset (ret, 0, sizeof (__cxa_refcounted_exception)); 136 137 return (void *)((char *)ret + sizeof (__cxa_refcounted_exception)); 138 }
- Exception object must be thread safe, therefore normally be allocated in the heap
Exception Handlers
- exception handler一定會呼叫到 __cxa_begin_catch 和 __cxa_end_catch
- __cxa_begin_catch define in gcc_folder/libstdc++-v3/libsupc++/eh_catch.cc
39 extern "C" void * 40 __cxxabiv1::__cxa_begin_catch (void *exc_obj_in) throw() 41 { 42 _Unwind_Exception *exceptionObject 43 = reinterpret_cast <_Unwind_Exception *>(exc_obj_in); 44 __cxa_eh_globals *globals = __cxa_get_globals (); 45 __cxa_exception *prev = globals->caughtExceptions; 46 __cxa_exception *header = __get_exception_header_from_ue (exceptionObject); 47 void* objectp; .....
- __cxa_begin_catch會做
- exception handler count +1
- 把exception加進global的caught exception
- uncaught exception count -1
30 extern "C" void * 31 __cxxabiv1::__cxa_get_exception_ptr(void *exc_obj_in) throw() 32 { 33 _Unwind_Exception *exceptionObject 34 = reinterpret_cast <_Unwind_Exception *>(exc_obj_in); 35 36 return __gxx_caught_object(exceptionObject); 37 }
- __cxa_get_exception_ptr回傳adjust pointer, 當catch的參數不複雜時 不會呼叫
- __cxa_end_catch define in gcc_folder/libstdc++-v3/libsupc++/eh_catch.cc
90 extern "C" void 91 __cxxabiv1::__cxa_end_catch () 92 { 93 __cxa_eh_globals *globals = __cxa_get_globals_fast (); 94 __cxa_exception *header = globals->caughtExceptions; 95 96 // A rethrow of a foreign exception will be removed from the 97 // the exception stack immediately by __cxa_rethrow. 98 if (!header) 99 return; .....
- handler count of the exception-1
- 當handler count為0時 把exception 從caught exception stack中移除
- Destroy the exception if the handler count == 0
Exception handling Porting in GCC
- http://luse.blogspot.tw/2010/01/c.html
- C++ ABI for Itanium: Exception handling
- sjlj vs dw2 in GCC
- http://stackoverflow.com/questions/318383/exception-handling-models-of-gcc
- sjlj的作法 是把current context都存在current stack中然後呼叫 _Unwind_SjLj_Register 把著些context chain起來, unwind install 時在把register 設為handler在的context
- sjlj為了紀錄所有frame的context, 每call 一個function 就會呼叫一次_Unwind_SjLj_Register, performance上可能會比較差, 因為每call個frame就要儲存所有的資訊
- dw2 unwinding則是利用存在.eh_frame sectino裏的CIE和FDE判斷 unwinding時需要設回哪些register不需要每個frame都在執行一次store context的動作,所以 performance應該會比較好.可是為了儲存.eh_frame的資訊,codesize也會大一些
- GCC exception table
- Support c++ exception in newlib
- ELF startup code
- .eh_frame section
- .eh_frame_hdr
- .gcc_except_table
/* Use which register to pass data to exception handling library routines */ /* Also need save these register in prolog */ #define EH_RETURN_DATA_REGNO(N) /* The data would be a exception object pointer which will be initial in PERSONALITY_FUNCTION */
/* Use which register to adjust $sp before leave _Unwind_Resume */ /* Need emit the sp adjust in exception handler epilog */ #define EH_RETURN_STACKADJ_RTX
Implement EH_RETURN_HANDLER_RTX or eh_return pattern to modify return address then jump to exception handler when return from exception handling library routines
- When we write throw in C++ program, the assembly will generate following calls
- __cxa_allocate_exception
- create exception object
- source code in: gcc_folder/libstdc++-v3/libsupc++/eh_alloc.cc
- __cxa_throw
- create exception header and then call _Unwind_RaiseException or _Unwind_SjLj_RaiseException
- source code in: gcc_folder/libstdc++-v3/libsupc++/eh_throw.cc
- _Unwind_Resume
- Resume propagation of an existing exception
- source code in : gcc_folder/gcc/unwind.inc which will include to unwind-sjlj.c and unwind-dw2.c
- __cxa_allocate_exception
- Definition of FDE and CIE in unwind-dw2-fde.h
/* Terminology: CIE - Common Information Element FDE - Frame Descriptor Element There is one per function, and it describes where the function code is located, and what the register lifetimes and stack layout are within the function. The data structures are defined in the DWARF specification, although not in a very readable way (see LITERATURE). Every time an exception is thrown, the code needs to locate the FDE for the current function, and starts to look for exception regions from that FDE. This works in a two-level search: a) in a linear search, find the shared image (i.e. DLL) containing the PC b) using the FDE table for that shared object, locate the FDE using binary search (which requires the sorting). */
- _Unwind_RaiseException source in unwind.inc
- Phase1 find the handler without change stack (with flag _UA_SEARCH_PHASE)
- call uw_frame_state_for (struct _Unwind_Context *context, _Unwind_FrameState *fs)
- According to context->ra to find the caller frame state
- will call _Unwind_Find_FDE (context->ra, ..) find FDE from dwarf2 object and then decode FDE and the update to fs structure
- According to context->ra to find the caller frame state
- Unwinding until find handler
- call uw_frame_state_for (struct _Unwind_Context *context, _Unwind_FrameState *fs)
- Phase1 find the handler without change stack (with flag _UA_SEARCH_PHASE)
- _Unwind_RaiseException_Phase2 source in unwind.inc
- Phase2 unwind and change stack to pass exception object to handler
- Phase2 will call (*fs.personality) with _UA_CLEANUP_PHASE flag
- which will update context to pass exception object
- PERSONALITY_FUNCTION in libstdc++-v3/libsupc++/eh_personality.cc
- Phase2 unwind and change stack to pass exception object to handler
... _Unwind_SetGR (context, __builtin_eh_return_data_regno (0), (_Unwind_Ptr) ue_header); _Unwind_SetGR (context, __builtin_eh_return_data_regno (1), 0); _Unwind_SetIP (context, landing_pad); return _URC_INSTALL_CONTEXT; }
- call uw_install_context in _Unwind_RaiseException_Phase2 to change control to handler
/* Install TARGET into CURRENT so that we can return to it. This is a macro because __builtin_eh_return must be invoked in the context of our caller. */ #define uw_install_context(CURRENT, TARGET) \ do \ { \ long offset = uw_install_context_1 ((CURRENT), (TARGET)); \ void *handler = __builtin_frob_return_addr ((TARGET)->ra); \ _Unwind_DebugHook ((TARGET)->cfa, handler); \ __builtin_eh_return (offset, handler); \ } \ while (0)
- uw_install_context_1: recover handler registers and calculate sp offset
- sp offset will set to EH_RETURN_STACKADJ_RTX
- __builtin_frob_return_addr: get handler address
- _builtin_eh_return: modify return address change control to exception hanlder
- __builtin_eh_return in ARM
arm_set_return_address (rtx source, rtx scratch) { arm_stack_offsets *offsets; HOST_WIDE_INT delta; rtx addr; unsigned long saved_regs; offsets = arm_get_frame_offsets (); saved_regs = offsets->saved_regs_mask; if ((saved_regs & (1 << LR_REGNUM)) == 0) <== if return address in LR_REGNUM, emit a move to change return address to handler emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source); else <== if return address in memory, change return address to handler { if (frame_pointer_needed) addr = plus_constant(hard_frame_pointer_rtx, -4); else { /* LR will be the first saved register. */ delta = offsets->outgoing_args - (offsets->frame + 4); if (delta >= 4096) { emit_insn (gen_addsi3 (scratch, stack_pointer_rtx, GEN_INT (delta & ~4095))); addr = scratch; delta &= 4095; } else addr = stack_pointer_rtx; addr = plus_constant (addr, delta); } emit_move_insn (gen_frame_mem (Pmode, addr), source); } }
Use GCC plugin to add a new optimization pass
- A plugin registers a new pass with GCC by calling register_callback with the PLUGIN_PASS_MANAGER_SETUP event and a pointer to a struct register_pass_info object defined as follows
- define in tree-pass.h
enum pass_positioning_ops { PASS_POS_INSERT_AFTER, // Insert after the reference pass. PASS_POS_INSERT_BEFORE, // Insert before the reference pass. PASS_POS_REPLACE // Replace the reference pass. };
struct register_pass_info { struct opt_pass *pass; /* New pass to register. */ const char *reference_pass_name; /* Name of the reference pass for hooking up the new pass. */ int ref_pass_instance_number; /* Insert the pass at the specified instance number of the reference pass. Do it for every instance if it is 0. */ enum pass_positioning_ops pos_op; /* how to insert the new pass. */ };
/* Optimization pass type. */ enum opt_pass_type { GIMPLE_PASS, RTL_PASS, SIMPLE_IPA_PASS, IPA_PASS };
/* Describe one pass; this is the common part shared across different pass types. */ struct opt_pass { /* Optimization pass type. */ enum opt_pass_type type; /* Terse name of the pass used as a fragment of the dump file name. If the name starts with a star, no dump happens. */ const char *name; /* If non-null, this pass and all sub-passes are executed only if the function returns true. */ bool (*gate) (void); /* This is the code to run. If null, then there should be sub-passes otherwise this pass does nothing. The return value contains TODOs to execute in addition to those in TODO_flags_finish. */ unsigned int (*execute) (void); /* A list of sub-passes to run, dependent on gate predicate. */ struct opt_pass *sub; /* Next in the list of passes to run, independent of gate predicate. */ struct opt_pass *next; /* Static pass number, used as a fragment of the dump file name. */ int static_pass_number; /* The timevar id associated with this pass. */ /* ??? Ideally would be dynamically assigned. */ timevar_id_t tv_id; /* Sets of properties input and output from this pass. */ unsigned int properties_required; unsigned int properties_provided; unsigned int properties_destroyed; /* Flags indicating common sets things to do before and after. */ unsigned int todo_flags_start; unsigned int todo_flags_finish; };
/* Sample plugin code that registers a new pass. */ int plugin_init (struct plugin_name_args *plugin_info, struct plugin_gcc_version *version) { struct register_pass_info pass_info; ... /* Code to fill in the pass_info object with new pass information. */ ... /* Register the new pass. */ register_callback (plugin_info->base_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &pass_info); ... }
Inline rules in GCC
- according the inline relative parameter default setting in params.def
- all instruction number means internal gcc instruction numbers, not assembly instruction numbers
# The function marked inline and the method defined in class if insn numbers < 400 # when -finline-functions (-O3) is enable, the function insns < 40 will auto inlined # The maximum number of instructions inline function can grow to via recursive inlining is 450 # The maximum number of instructions non-inline function can grow to via recursive inlining is 450 # The maximum depth of recursive inlining for inline functions is 8 # The maximum depth of recursive inlining for non-inline functions is 8 # Inline recursively only when the probability of call being executed exceeds the parameter is 10 # The maximum number of nested indirect inlining performed by early inliner is 10 # The size of function body to be considered large is 2700 # Maximal growth due to inlining of large function (in percent) 100 # The size of translation unit to be considered large 10000 # How much can given compilation unit grow because of the inlining (in percent) is 30 # How much can given compilation unit grow because of the interprocedural constant propagation (in percent) 10 # Maximal estimated growth of function body caused by early inlining of single call is 10 # The size of stack frame to be considered large is 256 # Maximal stack frame growth due to inlining (in percent) is 1000
RTX cost in GCC
- /home/cychen/source/462/gcc/optabs.c
- in avoid_expensive_constant,
set rtx_cost to guide gcc generate constant in operation (binoptab->code) or just force the constant to register.
static rtx avoid_expensive_constant (enum machine_mode mode, optab binoptab, rtx x, bool unsignedp) { bool speed = optimize_insn_for_speed_p (); if (mode != VOIDmode && optimize && CONSTANT_P (x) && rtx_cost (x, binoptab->code, speed) > rtx_cost (x, SET, speed)) { if (CONST_INT_P (x)) { HOST_WIDE_INT intval = trunc_int_for_mode (INTVAL (x), mode); if (intval != INTVAL (x)) x = GEN_INT (intval); } else x = convert_modes (mode, VOIDmode, x, unsignedp); x = force_reg (mode, x); } return x; }
- gcc_source/gcc/loop-invariant.c
- In create_new_invariant
GCC will use address cost compare with magic number 3 to determine the insn is cheap or not. The cheap address will no accumulate the rtx cost of the insn. Then the insn may not move out of the loop.
static struct invariant * create_new_invariant (struct def *def, rtx insn, bitmap depends_on, bool always_executed) { struct invariant *inv = XNEW (struct invariant); rtx set = single_set (insn); bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn)); inv->def = def; inv->always_executed = always_executed; inv->depends_on = depends_on; /* If the set is simple, usually by moving it we move the whole store out of the loop. Otherwise we save only cost of the computation. */ if (def) <== When the Insn will set value to register, def will be true. { inv->cost = rtx_cost (set, SET, speed); /* ??? Try to determine cheapness of address computation. Unfortunately the address cost is only a relative measure, we can't really compare it with any absolute number, but only with other address costs. But here we don't have any other addresses, so compare with a magic number anyway. It has to be large enough to not regress PR33928 (by avoiding to move reg+8,reg+16,reg+24 invariants), but small enough to not regress 410.bwaves either (by still moving reg+reg invariants). See http://gcc.gnu.org/ml/gcc-patches/2009-10/msg01210.html . */ inv->cheap_address = address_cost (SET_SRC (set), word_mode, ADDR_SPACE_GENERIC, speed) < 3; } else { inv->cost = rtx_cost (SET_SRC (set), SET, speed); inv->cheap_address = false; } ..
- gcc_source/gcc/fwprop.c
- In try_fwprop_subst
/* Try substituting NEW into LOC, which originated from forward propagation of USE's value from DEF_INSN. SET_REG_EQUAL says whether we are substituting the whole SET_SRC, so we can set a REG_EQUAL note if the new insn is not recognized. Return whether the substitution was performed. */ static bool try_fwprop_subst (df_ref use, rtx *loc, rtx new_rtx, rtx def_insn, bool set_reg_equal) { rtx insn = DF_REF_INSN (use); rtx set = single_set (insn); rtx note = NULL_RTX; bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn)); int old_cost = 0; bool ok; update_df_init (def_insn, insn); /* forward_propagate_subreg may be operating on an instruction with multiple sets. If so, assume the cost of the new instruction is not greater than the old one. */ if (set) old_cost = rtx_cost (SET_SRC (set), SET, speed); ... validate_unshare_change (insn, loc, new_rtx, true); if (!verify_changes (0)) { if (dump_file) fprintf (dump_file, "Changes to insn %d not recognized\n", INSN_UID (insn)); ok = false; } else if (DF_REF_TYPE (use) == DF_REF_REG_USE && set && rtx_cost (SET_SRC (set), SET, speed) > old_cost) { if (dump_file) fprintf (dump_file, "Changes to insn %d not profitable\n", INSN_UID (insn)); ok = false; } ...
- gcc_source/gcc/fwprop.c
- In should_replace_address
static bool should_replace_address (rtx old_rtx, rtx new_rtx, enum machine_mode mode, addr_space_t as, bool speed) { int gain; if (rtx_equal_p (old_rtx, new_rtx) || !memory_address_addr_space_p (mode, new_rtx, as)) return false; /* Copy propagation is always ok. */ if (REG_P (old_rtx) && REG_P (new_rtx)) return true; /* Prefer the new address if it is less expensive. */ gain = (address_cost (old_rtx, mode, as, speed) - address_cost (new_rtx, mode, as, speed)); /* If the addresses have equivalent cost, prefer the new address if it has the highest `rtx_cost'. That has the potential of eliminating the most insns without additional costs, and it is the same that cse.c used to do. */ if (gain == 0) gain = rtx_cost (new_rtx, SET, speed) - rtx_cost (old_rtx, SET, speed); return (gain > 0); }
Loop Invariant Code Motion in GCC
- Move loop invariant out of loop may have performance improvement
- However, it may increase the register pressure and produce spill code.
- Therefore, GCC need a way to estimate register pressure increment for Loop Invariant Code Motion (LICM).
- GCC have two way to estimate register pressure for LICM.
- IRA way register pressure estimation
- Use IRA based register pressure calculation in RTL loop optimizations.
- Use -fira-loop-pressure to enable
- Scan the register pressure increment for each cover class.
- Non-IRA way register pressure estimation
- IRA way register pressure estimation
- gain_for_invariant in loop-invariant.c
- Get computation cost(comp_cost) from rtx_cost which will simulate performance gain
- Get size cost(size_cost) from IRA way or non-IRA way which will simulate size increment from spill code
- The function will return gain = comp_cost - size_cost
- Non-positive gains will not do the code motion.
- Use two array to record register pressure increment
- new_regs: The register increment of all loop invariant except current one (accumulate)
- regs_needed: The register increment of current loop invariant (current one)
- Non-IRA way will call "estimate_reg_pressure_cost" which define in cfgloopanal.c
/* Estimates cost of increased register pressure caused by making N_NEW new registers live around the loop. N_OLD is the number of registers live around the loop. If CALL_P is true, also take into account that call-used registers may be clobbered in the loop body, reducing the number of available registers before we spill. */ unsigned estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed, bool call_p) { unsigned cost; unsigned regs_needed = n_new + n_old; unsigned available_regs = target_avail_regs; /* If there is a call in the loop body, the call-clobbered registers are not available for loop invariants. */ if (call_p) available_regs = available_regs - target_clobbered_regs; /* If we have enough registers, we should use them and not restrict the transformations unnecessarily. */ if (regs_needed + target_res_regs <= available_regs) return 0; if (regs_needed <= available_regs) /* If we are close to running out of registers, try to preserve them. */ cost = target_reg_cost [speed] * n_new; else /* If we run out of registers, it is very expensive to add another one. */ cost = target_spill_cost [speed] * n_new; if (optimize && (flag_ira_region == IRA_REGION_ALL || flag_ira_region == IRA_REGION_MIXED) && number_of_loops () <= (unsigned) IRA_MAX_LOOPS_NUM) /* IRA regional allocation deals with high register pressure better. So decrease the cost (to do more accurate the cost calculation for IRA, we need to know how many registers lives through the loop transparently). */ cost /= 2; return cost; }
- Setting target_reg_cost and target_spill_cost in init_set_costs
for (speed = 0; speed < 2; speed++) { crtl->maybe_hot_insn_p = speed; /* Set up the costs for using extra registers: 1) If not many free registers remain, we should prefer having an additional move to decreasing the number of available registers. (TARGET_REG_COST). 2) If no registers are available, we need to spill, which may require storing the old value to memory and loading it back (TARGET_SPILL_COST). */ start_sequence (); emit_move_insn (reg1, reg2); seq = get_insns (); end_sequence (); target_reg_cost [speed] = seq_cost (seq, speed); start_sequence (); emit_move_insn (mem, reg1); emit_move_insn (reg2, mem); seq = get_insns (); end_sequence (); target_spill_cost [speed] = seq_cost (seq, speed); }
- IRA way to estimate register pressure
for (i = 0; i < ira_reg_class_cover_size; i++) { cover_class = ira_reg_class_cover[i]; if ((int) new_regs[cover_class] + (int) regs_needed[cover_class] + LOOP_DATA (curr_loop)->max_reg_pressure[cover_class] + IRA_LOOP_RESERVED_REGS > ira_available_class_regs[cover_class]) break; } if (i < ira_reg_class_cover_size) /* There will be register pressure excess and we want not to make this loop invariant motion. All loop invariants with non-positive gains will be rejected in function find_invariants_to_move. Therefore we return the negative number here. One could think that this rejects also expensive loop invariant motions and this will hurt code performance. However numerous experiments with different heuristics taking invariant cost into account did not confirm this assumption. There are possible explanations for this result: o probably all expensive invariants were already moved out of the loop by PRE and gimple invariant motion pass. o expensive invariant execution will be hidden by insn scheduling or OOO processor hardware because usually such invariants have a lot of freedom to be executed out-of-order. Another reason for ignoring invariant cost vs spilling cost heuristics is also in difficulties to evaluate accurately spill cost at this stage. */ return -1; else size_cost = 0;
- IRA_LOOP_RESERVED_REGS define in params.def
DEFPARAM (PARAM_IRA_LOOP_RESERVED_REGS, "ira-loop-reserved-regs", "The number of registers in each class kept unused by loop invariant motion", 2, 0, 0)
Data speculation and control speculation in Itanium
GCC instruction scheduling
- GCC have two phase to do instruction scheduling
- one before register allocation and may do interblock scheduling
- one after register allocation and the region will be single basic block
- Scheduling main entry function schedule_insns in sched-rgn.c
/* The one entry point in this file. */ 3299 void 3300 schedule_insns (void) 3301 { 3302 int rgn; 3303 3304 /* Taking care of this degenerate case makes the rest of 3305 this code simpler. */ 3306 if (n_basic_blocks == NUM_FIXED_BLOCKS) 3307 return; 3308 3309 rgn_setup_common_sched_info (); 3310 rgn_setup_sched_infos (); 3311 3312 haifa_sched_init (); 3313 sched_rgn_init (reload_completed); 3314 3315 bitmap_initialize (¬_in_df, 0); 3316 bitmap_clear (¬_in_df); 3317 3318 /* Schedule every region in the subroutine. */ 3319 for (rgn = 0; rgn < nr_regions; rgn++) 3320 if (dbg_cnt (sched_region)) 3321 schedule_region (rgn); 3322 3323 /* Clean up. */ 3324 sched_rgn_finish (); 3325 bitmap_clear (¬_in_df); 3326 3327 haifa_sched_finish (); 3328 }
- The sched1 pass before register allocation may do interblock scheduling
- before do interblock scheduling will call find_rgns() in sched-rgn.c
1068 /* Wrapper function. 1069 If FLAG_SEL_SCHED_PIPELINING is set, then use custom function to form 1070 regions. Otherwise just call find_rgns_haifa. */ 1071 static void 1072 find_rgns (void) 1073 { 1074 if (sel_sched_p () && flag_sel_sched_pipelining) 1075 sel_find_rgns (); 1076 else 1077 haifa_find_rgns (); 1078 }
- haifa_find_rgns () will initial the region
- Identify inner loop as region
- perform DFS search to find back edge (find loop)
- Use dominate info to identify nature loop (loop header dominate all loop body blocks)
- when the flag max-sched-extend-regions-iters is not zero
- perform extend regions
- The idea is to topologically walk through CFG in top-down order.
- if all the predecessors of a node are marked to be in the same region then current node is also marked to be a part of that region.
- Otherwise the node starts its own region
- The idea is to topologically walk through CFG in top-down order.
- perform extend regions
- Other basic blocks not in any region will form to a region as single basic block.
- Identify inner loop as region
- After form regions, call schedule_region () do region scheduling
- call compute_block_dependences(int bb) compute dependences inside bb
- call sched_analyze () in compute_block_dependences ()
- call deps_analyze_insn() in sched_analyze ()
- For jump instruction set memory barrier for scheduling
- memory access instruction won't cross jump instructions
- For call instructions
- If is a SETJMP instruction, Assume that all registers, may be clobbered by this call
- clobber caller save register
- call sched_analyze_insn ()
- Analyze an INSN with pattern X to find all dependencies.
- For jump instruction set memory barrier for scheduling
- call deps_analyze_insn() in sched_analyze ()
- call sched_analyze () in compute_block_dependences ()
- call compute_block_dependences(int bb) compute dependences inside bb
- Compute priority for each insn in the region
2897 void 2898 compute_priorities (void) 2899 { 2900 int bb; 2901 2902 current_sched_info->sched_max_insns_priority = 0; 2903 for (bb = 0; bb < current_nr_blocks; bb++) 2904 { 2905 rtx head, tail; 2906 2907 gcc_assert (EBB_FIRST_BB (bb) == EBB_LAST_BB (bb)); 2908 get_ebb_head_tail (EBB_FIRST_BB (bb), EBB_LAST_BB (bb), &head, &tail); 2909 2910 if (no_real_insns_p (head, tail)) 2911 continue; 2912 2913 rgn_n_insns += set_priorities (head, tail); 2914 } 2915 current_sched_info->sched_max_insns_priority++; 2916 }
- set_priorities will call priority (rtx insn)
- the priority of each insn is max (cost + priority_of_each_consumer)
- cost is the cycle latency from the insn issue and results
- depend on scheduling porting
- cost is the cycle latency from the insn issue and results
- Therefore, the following insn will have high priority
- long latency insn
- have high priority consumer
- lots of successor insn
- the priority of each insn is max (cost + priority_of_each_consumer)
224 /* An instruction is ready to be scheduled when all insns preceding it 225 have already been scheduled. It is important to ensure that all 226 insns which use its result will not be executed until its result 227 has been computed. An insn is maintained in one of four structures: 228 229 (P) the "Pending" set of insns which cannot be scheduled until 230 their dependencies have been satisfied. 231 (Q) the "Queued" set of insns that can be scheduled when sufficient 232 time has passed. 233 (R) the "Ready" list of unscheduled, uncommitted insns. 234 (S) the "Scheduled" list of insns. 235 236 Initially, all insns are either "Pending" or "Ready" depending on 237 whether their dependencies are satisfied. 238 239 Insns move from the "Ready" list to the "Scheduled" list as they 240 are committed to the schedule. As this occurs, the insns in the 241 "Pending" list have their dependencies satisfied and move to either 242 the "Ready" list or the "Queued" set depending on whether 243 sufficient time has passed to make them ready. As time passes, 244 insns move from the "Queued" set to the "Ready" list. 245 246 The "Pending" list (P) are the insns in the INSN_FORW_DEPS of the 247 unscheduled insns, i.e., those that are ready, queued, and pending. 248 The "Queued" set (Q) is implemented by the variable `insn_queue'. 249 The "Ready" list (R) is implemented by the variables `ready' and 250 `n_ready'. 251 The "Scheduled" list (S) is the new insn chain built by this pass. 252 253 The transition (R->S) is implemented in the scheduling loop in 254 `schedule_block' when the best insn to schedule is chosen. 255 The transitions (P->R and P->Q) are implemented in `schedule_insn' as 256 insns move from the ready list to the scheduled list. 257 The transition (Q->R) is implemented in 'queue_to_insn' as time 258 passes or stalls are introduced. */
Phase ordering Issues of Register Allocation and Scheduling
- Register allocation and scheduling have conflict goal
- Register allocation: Minimize spill code, create sequential code for register reuse
- Scheduler: Try to fill all parallel unit and make the register pressure higher
- Instruction Scheduling followed by Register Allocation followed by post-scheduling
- GCC use the order sched1->RA->sched2
- schedule before RA have more opportunity to find ILP
- could perform renaming to resolve output/anti dependency therefore have wilder range to schedule
- may increase register usage
- post-scheduling: scheduling again after generate spill code
- won't increase register usage
- Current GCC also have cooperative approaches
- a register pressure sensitive instruction scheduling before RA
- Use flag -fsched-pressure to enable it
Data flow initialization in GCC
- in phase .dfinit
- rest_of_handle_df_initialize () in df-core.c
- Add problems
- df_scan_add_problem()
- df_lr_add_problem()
- if (optimize > 1) df_live_add_problem()
- initialize df_d data structure
- df_hard_reg_init () : setup eliminate registers
- df_compute_register_ever_live (true)
- compute regs_ever_live
- parameter determine reset or not
- df_scan_blocks() : scan all of the blocks in block_to_analyze
- scan all blocks in function if block_to_analyze is NULL
- df_get_regular_block_artificial_uses
- regular_block: non-entry, non-exit basic block
- artificial use:
- frame pointer
- hardware frame pointer
- argument pointer
- PIC_OFFSET_TABLE_REGNUM
- stack pointer
- EH_RETURN_DATA_REGNO
- Add problems
- rest_of_handle_df_initialize () in df-core.c
- data flow problems (df-problems.c)
- reaching definition
- upward exposed uses
- live variables
- LR problem
- UR problem
- the intersection of LR and UR
- un-initialized variables
- def-use chain
- use-def chain
- Example of using dataflow routines
- df_*_add_problem (flags)
- add a problem
- a problem could depend on other problem
- DU-chain or UD-chain is dependent on solving reaching definition
- df_set_blocks (blocks)
- setup the region of dataflow analysis
- default scan entire function without the df_set_blocks
- df_analyze (): solve the problems
- df_dump(stderr) : dump debug log
- df_finish_pass() : call df_remove_problem()
- df_*_add_problem (flags)
- Data structure of dataflow relative
- reg-def: all the define of the register
- reg-use: all the use of the register
- insn-def: all the define of the insn
- insn-use: all the use of the insn
- def-use: all the use of the define
- use-def: all the define of the use
- df insn info kept in an array DF_INSN_INFO
- index by insn uid
- Each insn has 3 list
- defs list: access by DF_INSN_INFO_DEFS
- uses list: access by DF_INSN_INFO_USES
- equal list: REG_EQUAL
- Each insn has logical uid (LUID) store in DF_INSN_INFO
- LUID only correct after call df_analyze ()
- artificial defs may exist in basic block end to mark always live
- Ex: stack pointer
- Control flow graph wiki
LMA/VMA
- LMA:load address
- VMA:virtual address
- GNU linker provide the feature for the code
- run on ROM and need to access global data (data section)
- However, ROM only can read
- Therefore, linker provide the feature to access data section in VMA (RAM)
- Which means data section in image would place on LMA but global data access would base on VMA
- User need to copy data section from LMA(ROM image) to VMA(RAM) in initial code.
- Reference: http://www.delorie.com/gnu/docs/binutils/ld_33.html
LLVM Note
- Modify configure files for new target
- configure
- autoconf/config.sub
- projects/sample/autoconf/config.sub
- include/llvm/ADT/Triple.h
- lib/Support/Triple.cpp
- Use TableGen to describe target in LLVM.
- TableGen wiki
- Some preserve word example for *.td
- test/TableGen/SetTheory.td
- Use tblgen binary to check the *.td file after transfer to c++
- http://llvm.org/docs/CommandGuide/tblgen.html
- $ llvm-tblgen lib/Target/X86/X86.td -I=include -I=lib/Target/X86
- Register Relative class
- include/llvm/Target/Target.td
- SubRegIndex
- identify sub-register
- Register
- specify register name and sub-reg relation
- RegisterClass
- to specify register class for instruction operand constraint including
- regType
- alignment
- register-list (register allocation order)
- alternative (register allocation) order
- to specify register class for instruction operand constraint including
- SubRegIndex
- include/llvm/Target/Target.td
Aarch64 Frame Stack
+-------------------------------+ | | | incoming stack arguments | | | +-------------------------------+ <-- arg_pointer_rtx \ | | \ | callee-allocated save area | | for register varargs | | | +-------------------------------+ original_frame_size | | | local variables | | | / +-------------------------------+ <-- frame_pointer_rtx/ | | | callee-saved registers | | | +-------------------------------+ | LR' | +-------------------------------+ | FP' | P +-------------------------------+ <-- hard_frame_pointer_rtx | dynamic allocation | +-------------------------------+ | | | outgoing stack arguments | | | +-------------------------------+ <-- stack_pointer_rtx Dynamic stack allocations such as alloca insert data at point P. They decrease stack_pointer_rtx but leave frame_pointer_rtx and hard_frame_pointer_rtx unchanged. */ /* sub sp, sp, #<frame_size> stp {fp, lr}, [sp, #<frame_size> - 16] add fp, sp, #<frame_size> - hardfp_offset stp {cs_reg}, [fp, #-16] etc. sub sp, sp, <final_adjustment_if_any> */
Prologue Generation Note
- aarch64_expand_prologue (void)
/* The reason for following code is to split the sp adjust timing which could make the store pair code-gen for callee save could meet the store pair offset constraint. */ offset = frame_size = total frame size; fp_offset = (offset - original_frame_size - cfun->machine->frame.saved_regs_size); /* limit to 512 because it's load/store pair offset limit. */ if (offset < 512) { if (frame_pointer_needed) { if (fp_offset > 0) { 1) adjust sp (total frame_size) 2) stp x29, x30, [sp, 0] } else { 1) stp x29, x30, [sp, -48(adjust sp)]! } 3) initial FP value; } else { 1) adjust sp (total frame_size) } 5) store pair for callee save registers call aarch64_save_or_restore_callee_save_registers } else { fp_offset = 0; offset = original_frame_size + cfun->machine->frame.saved_regs_size; if (offset >= 512) offset = cfun->machine->frame.saved_regs_size; frame_size -= (offset + crtl->outgoing_args_size); if (split could make offset < 512) { split frame to 2 part: ------------ | offset | ------------ | outgoing | ------------ 1) generate offset relative code like if (offset <512) above 2) generate sp adjust for outgoing } else { split frame to 3 part: ------------ |frame_size| ------------ | offset | ------------ | outgoing | ------------ 1) generate frame_size adjust code if (frame_size >= 0x1000000) { a. set frame_size to r16(IP0) b. adjust sp by sub r16 } else { a. sub sp ,{frame_size & ~(0xfff)} b. sub sp ,{frame_size & 0xfff} the benifit to split high/low part of sp adjust: each part have chance execute by one sub. } 2) generate offset part like if (offset <512) above 3) generate sp adjust for outgoing } }
Epilogue Generation Note
- aarch64_expand_epilogue (bool for_sibcall)
if (offset >= 512) { if (!frame_pointer_needed && crtl->outgoing_args_size > 0) { sp = sp + {outgoing_args_size} } } if (frame_pointer_needed && (crtl->outgoing_args_size || cfun->calls_alloca)) { sp = fp - fp_offset cfa_reg = stack_pointer_rtx; (it seems for helping debug message generation) } /* generate callee save registers pop */ aarch64_save_or_restore_callee_save_registers (fp_offset + cfun->machine->frame.hardfp_offset, 1); if (offset > 0) { if (frame_pointer_needed) { if (fp_offset > 0) { 1) ldp x29, x30, [fp, fp_offset] 2) sp = sp + offset } else { 1) ldp x29, x30, [sp, offset]! 2) add_reg_note of the 1) insn as REG_CFA_ADJUST_CFA } } else { 1) sp = sp + offset } } /* Stack adjustment for exception handler. */ if (crtl->calls_eh_return) { /* We need to unwind the stack by the offset computed by EH_RETURN_STACKADJ_RTX. However, at this point the CFA is based on SP. Ideally we would update the SP and define the CFA along the lines of: SP = SP + EH_RETURN_STACKADJ_RTX (regnote CFA = SP - EH_RETURN_STACKADJ_RTX) However the dwarf emitter only understands a constant register offset. The solution chosen here is to use the otherwise unused IP0 as a temporary register to hold the current SP value. The CFA is described using IP0 then SP is modified. */ insn = emit_move_insn (IP0, SP); add_reg_note of insn as REG_CFA_DEF_CFA RTX_FRAME_RELATED_P (insn) = 1; sp = sp + EH_RETURN_STACKADJ_RTX } if (the frame split to 3 part) { ------------ |frame_size| ------------ | offset | ------------ | outgoing | ------------ in this time outgoing and offset have been adjust generate frame_size adjust code if (frame_size >= 0x1000000) { a. set frame_size to r16(IP0) b. adjust sp by add r16 } else { a. add sp ,{frame_size & ~(0xfff)} b. add sp ,{frame_size & 0xfff} the benifit to split high/low part of sp adjust: each part have chance execute by one add. } } emit_use (LR) if (!for_sibcall) emit ret;
Probability Theory and Statistics Note
- Bayes' theorem
- Categorical distribution
- http://en.wikipedia.org/wiki/Categorical_distribution
- Categorical distribution is the generalize form of Bernoulli distribution
- Bernoulli distribution
- Indicator function
- Nonparametric statistics
- Hamming distance
- k-nearest neighbors algorithm
- Cross-validation
- Principal component analysis