Start GCC Porting

  • add target prefix in config.sub
  • add cpu_type ,tm_file and extra_part in gcc/config.gcc
    • tm_file: target make file including header file and t-{target}
      • t-{target} specify how to make target specify function and library
    • extart_part: extra object file for the target compiler
  • Build cc1 only for testing
CUR_DIR=`pwd`

$CUR_DIR/../../configure \
--target=nds32le-elf \
--prefix=$CUR_DIR/../../toolchain \
 --disable-nls \
 --disable-shared \
 --enable-languages=c \
 --enable-__cxa_atexit\
 --enable-c99 \
 --enable-long-long \
 --enable-threads=single \
 --with-newlib \
 --disable-lto \
 --disable-multilib \
 --disable-libssp \
 --disable-libgomp \
 --disable-decimal-float \
 --disable-libffi \
 --disable-libmudflap \
 --with-gmp=/home/cychen/source/host-tools \
 --with-mpfr=/home/cychen/source/host-tools \
 --with-mpc=/home/cychen/source/host-tools \
 --with-ppl=/home/cychen/source/host-tools \
 --with-cloog=/home/cychen/source/host-tools \
 --with-host-libstdcxx='-lstdc++ -lm'

   make all-gcc LDFLAGS=-static CFLAGS='-g3 -O0'

   make install-gcc

Storage Layout relative macro

Define this macro to have the value 1 if the most significant bit in a byte has the
lowest number
#define BITS_BIG_ENDIAN
Define this macro to have the value 1 if the most significant byte in a word has the
lowest number
#define BYTES_BIG_ENDIAN
Define this macro to have the value 1 if, in a multiword object, the most significant
word has the lowest number
#define WORDS_BIG_ENDIAN
/* Width of a word, in units (bytes).  */
#define UNITS_PER_WORD 4
/* Define this macro if it is advisable to hold scalars in registers
   in a wider mode than that declared by the program.  In such cases,
   the value is constrained to be within the bounds of the declared
   type, but kept valid in the wider mode.  The signedness of the
   extension may differ from that of the type.  */
#define PROMOTE_MODE(MODE,UNSIGNEDP,TYPE) \
if (GET_MODE_CLASS (MODE) == MODE_INT           \
    && GET_MODE_SIZE (MODE) < UNITS_PER_WORD)   \
{                                               \
  (MODE) = SImode;                              \
}
  • promote scalar value in registers from narrower mode to SImode
Normal alignment required for function parameters on the stack, in bits
#define PARM_BOUNDARY 32
Define this macro to the minimum alignment enforced by hardware for the stack
pointer on this machine
#define STACK_BOUNDARY 32/64
Alignment required for a function entry point, in bits.
#define FUNCTION_BOUNDARY 32
Biggest alignment that any data type can require on this machine, in bits
#define BIGGEST_ALIGNMENT 32/64
/* Alignment of field after `int : 0' in a structure.  */
#define EMPTY_FIELD_BOUNDARY 32
/* A bit-field declared as `int' forces `int' alignment for the struct.  */
#define PCC_BITFIELD_TYPE_MATTERS 1
/* Every structure's size must be a multiple of this.  */
#define STRUCTURE_SIZE_BOUNDARY 8
Define this macro to be the value 1 if instructions will fail to work if given data not
on the nominal alignment
#define STRICT_ALIGNMENT
/* The best alignment to use in cases where we have a choice.  */
#define FASTEST_ALIGNMENT 32
/* Make strings word-aligned so strcpy from constants will be faster.  */
#define CONSTANT_ALIGNMENT(EXP, ALIGN)  \
  ((TREE_CODE (EXP) == STRING_CST       \
    && (ALIGN) < FASTEST_ALIGNMENT)     \
   ? FASTEST_ALIGNMENT : (ALIGN))
/* Make arrays of chars word-aligned for the same reasons.  */
#define DATA_ALIGNMENT(TYPE, ALIGN)             \
  (TREE_CODE (TYPE) == ARRAY_TYPE               \
   && TYPE_MODE (TREE_TYPE (TYPE)) == QImode    \
   && (ALIGN) < FASTEST_ALIGNMENT ? FASTEST_ALIGNMENT : (ALIGN))
/* Layout of source language data types.  */

#define SHORT_TYPE_SIZE         16
#define INT_TYPE_SIZE           32
#define LONG_TYPE_SIZE          32
#define LONG_LONG_TYPE_SIZE     64
#define FLOAT_TYPE_SIZE         32
#define DOUBLE_TYPE_SIZE        64
#define LONG_DOUBLE_TYPE_SIZE   64

/* Define this as 1 if `char' should by default be signed; else as 0.  */
#define DEFAULT_SIGNED_CHAR 1

#define SIZE_TYPE "long unsigned int"
#define PTRDIFF_TYPE "long int"
#define WCHAR_TYPE "short unsigned int"
#define WCHAR_TYPE_SIZE 16

Stack layout relative macro

/* Define this macro if pushing a word onto the stack moves the stack
   pointer to a smaller address.  */
#define STACK_GROWS_DOWNWARD
/* Define this to nonzero if the nominal address of the stack frame
   is at the high-address end of the local variables;
   that is, each additional local variable allocated
   goes at a more negative offset in the frame.  */
#define FRAME_GROWS_DOWNWARD
/* Offset within stack frame to start allocating local variables at.
   If FRAME_GROWS_DOWNWARD, this is the offset to the END of the
   first local allocated.  Otherwise, it is the offset to the BEGINNING
   of the first local allocated.  */
#define STARTING_FRAME_OFFSET
  • Offset from the frame pointer to the first local variable slot to be allocated
/* Offset from the stack pointer register to the first location at which
   outgoing arguments are placed.  */
#define STACK_POINTER_OFFSET
/* Offset from the argument pointer register to the first argument */
#define FIRST_PARM_OFFSET(FNDECL)
  • STACK_DYNAMIC_OFFSET
    • Offset from the stack pointer register to an item dynamically allocated on the stack, e.g., by alloca
    • default after outgoing arguments
  • DYNAMIC_CHAIN_ADDRESS
    • A C expression whose value is RTL representing the address in a stack frame where the

pointer to the caller’s frame is stored.

/* return address rtx for __builtin_return_address */
#define RETURN_ADDR_RTX nds32_return_addr_rtx
  • parameter count means return n-th caller's return address
  • in GCC 4.6.2, arm and mips not support count > 0
  • count == 0 return the current function return address, which is in register for most of targets.
  • Stack layout
 ---------------- hard fp
 [ callee saved ]
 ---------------- pseudo fp
 [ local vars   ]
 [ outgoing args]
 ---------------- sp

Basic Characteristics of Registers

/* hardware register number from 0 to a-1 */
#define FIRST_PSEUDO_REGISTER a
/* How to refer to registers in assembler output.
   This sequence is indexed by compiler's hard-register-number .  */
#define REGISTER_NAMES \
{"$r0", "$r1", "$r2", "$r3", "$r4", "$r5", "$r6", "$r7",                \
 "$r8", "$r9", "$r10","$r11","$r12","$r13","$r14","$r15",               \
 "$r16","$r17","$r18","$r19","$r20","$r21","$r22","$r23",       \
  ..
}
/* value 1 means not available for register allocation */
#define FIXED_REGISTERS                         \
  {                                             \
 /* r0  r1  r2  r3  r4  r5  r6  r7  r8  r9   */ \
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,      \
 /* r10 r11 r12 r13 r14 r15 r16 r17 r18 r19  */ \
    0,  0,  0,  0,  0,  1,  0,  0,  0,  0,      \
/* has 1 for each register that is clobbered (in general) by
function calls as well (caller save register) */
#define CALL_USED_REGISTERS                     \
  {                                             \
 /* r0  r1  r2  r3  r4  r5  r6  r7  r8  r9   */ \
    1,  1,  1,  1,  1,  1,  0,  0,  0,  0,      \
 /* r10 r11 r12 r13 r14 r15 r16 r17 r18 r19  */ \
    0,  0,  0,  0,  0,  1,  1,  1,  1,  1,      \
#define REG_ALLOC_ORDER                               \
  {   5,   4,   3,   2,   1,   0,   6,   7,   8,   9, \
     10,  11,  16,  17,  18,  19,  24,  25,  20,  21, \
     22,  23,  12,  13,  14,  28,  30,  26,  27,  31, \
     29,  32,  33,  34,  35,  36,  37,  15,  38,  39, \
  }
enum reg_class
  {
    NO_REGS,
    LO_REGS,
    HI_REGS,
    GENERAL_REGS,
    ...
   }
#define REG_CLASS_NAMES \
  {                     \
    "NO_REGS",          \
    "LO_REGS",          \
    "HI_REGS",          \
    "GENERAL_REGS",     \
     ...
  }
#define REG_CLASS_CONTENTS                                                                                                    \
  {                                                                                                                           \
    { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* NO_REGS                                 */ \
    { 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* LO_REGS      : 0-15 */ \
    { 0xffff0000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, /* HI_REGS      : 16-31 */ \
    ...
  }
#define IRA_COVER_CLASSES                                                    \
{                                                                            \
  GENERAL_REGS,\
  LIM_REG_CLASSES                                                            \
}
/* Tell IRA to use the order we define rather than messing it up with its
   own cost calculations. 
   (without the macro the cost of callee save reg will increase while cross function call)  */
#define HONOR_REG_ALLOC_ORDER
/* Return number of consecutive hard regs needed starting at reg REGNO
   to hold something of mode MODE.
   This is ordinarily the length in words of a value of mode MODE
   but can be less for certain modes in special long registers.  */
#define HARD_REGNO_NREGS(REGNO, MODE) \
((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)
/* return 1 if the regno is ok to content mode value */
#define HARD_REGNO_MODE_OK(regno, mode) 
/* A C expression that is nonzero if a value of mode mode1 is accessible in mode mode2
without copying. */
/* Tie QI/HI/SI modes together.  */
#define MODES_TIEABLE_P(MODE1, MODE2) \
(GET_MODE_CLASS (MODE1) == MODE_INT             \
 && GET_MODE_CLASS (MODE2) == MODE_INT          \
 && GET_MODE_SIZE (MODE1) <= UNITS_PER_WORD     \
 && GET_MODE_SIZE (MODE2) <= UNITS_PER_WORD)
/* Return class name for regno */
#define REGNO_REG_CLASS(regno) \
  (regno <= 15 ? LO_REGS : HI_REGS)
/* Return the maximum number of consecutive registers
   needed to represent mode MODE in a register of class CLASS.  */
#define CLASS_MAX_NREGS(CLASS, MODE) \
((GET_MODE_SIZE (MODE) + UNITS_PER_WORD - 1) / UNITS_PER_WORD)

Debugging Support

  • INCOMING_RETURN_ADDR_RTX
    • the way find return address before prolog to support call frame debug
    • If this RTL is a REG, you should also define DWARF_FRAME_RETURN_COLUMN to DWARF_FRAME_REGNUM (REGNO)
  • EH_RETURN_DATA_REGNO (N)
    • define the register to pass exception handle data
    • You must define this macro if you want to support call frame exception handling like

that provided by DWARF 2.

  • EH_RETURN_STACKADJ_RTX
    • adjustment to be applied before function return. This is used to unwind the stack to

an exception handler’s call frame. It will be assigned zero on code paths that return normally.

  • DWARF_FRAME_REGISTERS
  • DWARF_FRAME_REGNUM

Instruction pattern in GCC

  • There are three main conversions that happen in the compiler:
    1. The front end reads the source code and builds a parse tree.
    2. The parse tree is used to generate an RTL insn list based on named instruction patterns.
    3. The insn list is matched against the RTL templates to produce assembler code.
  • GCC定义了一些ISA上常见的instruction RTL pattern称为named pattern
    • 在parsing tree转换成RTL(register transfer language)的phase,GCC会用named RTL pattern相对应的parsing条件做比对,然后转换成named RTL pattern
    • named RTL pattern 可以用define_insndefine_expand定义
    • 在GCC build的过程中会根据RTL pattern在build folder下generate出insn-emit.c
  • 指令在都转换成RTL后,会经过一连串optimization,最后由define_insn match RTL pattern,然后print out assembly code.
    • 这时可以用nameless的define_insn pattern做assembly code matching
    • namelessdefine_insn,为了debug方便,取名为*开头的pattern name
      • 这使GCC知道它是nameless pattern,同时在debug过程中我们也可以知道它是match到哪个rtl pattern
    • nameless的pattern不会用在rtl的生成,仅用在assembly code的matching
Here is an actual example of an instruction pattern, for the 68000/68020.
(define_insn "tstsi" <= pattern name
[(set (cc0)
(match_operand:SI 0 "general_operand" "rm"))]
"" <= match condition      /\          /\
"*                         ||          ||  
{                       predicate   constraint
if (TARGET_68020 || ! ADDRESS_REG_P (operands[0]))
   return \"tstl %0\";  <= output assembly code
return \"cmpl #0,%0\";  <= %0 means print out operands[0]
}")
  • (match_operand:m n predicate constraint)
    • m: operand mode
    • n: operands number
    • predicate: operand n matching的条件,唯有满足这个条件才会match到这个rtl pattern
    • constraint: 可以由constraint来设定ISA上这个operands的限制,如果在optimization的过程有产生不符合的RTL,最后会由reload来解决,同时也会影响到register allocation的结果
    • predicate必须包含constraint,否则可能reload出不符predicate的pattern使compiler crash
  • (match_scratch:m n constraint)
    • could use it when the pattern need temporary register
    • If the last few expressions in a parallel are clobber expressions whose

operands are either a hard register or match_scratch, the combiner can add or delete them when necessary

  • (match_dup n)
    • It is used when the operand needs to appear more than once in the insn
    • should not use match_dup when the operands is both an input operand and the output operand
  • (match_operator:m n predicate [operands...])
    • match operator
    • There is no way to specify constraints in match_operator. The operand of the insn which corresponds to the match_operator never has any constraints, because it is never reloaded as a whole
    • it usually best if the operand number of the match_operator is higher than that of the actual operands of the insn. This improves register allocation because the register allocator often looks at operands 1 and 2 of insns to see if it can do register tying.
  • Back trace of expand RTL in GCC
#0  expand_expr_real (exp=0xb7ff3900, target=0x0, tmode=VOIDmode, modifier=EXPAND_NORMAL, alt_rtl=0x0) at /ho
me/cychen/source/462/work/build/../../gcc/expr.c:7221
#1  0x081b1a24 in expand_normal (exp=0xb7ff3900) at /home/cychen/source/462/work/build/../../gcc/expr.h:428
#2  0x081ba7b7 in expand_assignment (to=0xb7f61348, from=0xb7e27f60, nontemporal=0 '\000') at /home/cychen/so
urce/462/work/build/../../gcc/expr.c:4228
#3  0x081167fc in expand_gimple_stmt_1 (stmt=0xb7e21630) at /home/cychen/source/462/work/build/../../gcc/cfge
xpand.c:1975
#4  0x08116b2e in expand_gimple_stmt (stmt=0xb7e21630) at /home/cychen/source/462/work/build/../../gcc/cfgexp
and.c:2084
#5  0x0811b1c3 in expand_gimple_basic_block (bb=0xb7f81fc0) at /home/cychen/source/462/work/build/../../gcc/c
fgexpand.c:3626
#6  0x0811c0b9 in gimple_expand_cfg () at /home/cychen/source/462/work/build/../../gcc/cfgexpand.c:4109
  • The "movm" pattern with constraint "r" must have "m" constraint, or else it have chance to reload fail.
  • When generate RTL, operand match only consider predicate.
  • When optimization, after code motion will call extract_insn(rtx insn) which in recog.c
    • extract_insn will call recog_memoized first to recognize pattern by predicate.
      • which eventually will call recog defined in insn-recog.c
        • insn-recog.c is generated when building gcc according to *.md file
    • and then will determine the RTL pattern belong to which alternative by the pattern constraint

Operand Constraints

  • m: memory operands, the letter could re-defined by TARGET_MEM_CONSTRAINT
  • o: offsettable memory operand
  • V: not-offsettable memory operand
  • <: auto-decrement(could be predecremnt or postdecremnt) addressing
  • >: auto-increment(could be preincremnt or postincremnt) addressing
  • r: register operand is allowed provided that it is in a general register
  • i: immediate integer operand (one with constant value) is allowed. This includes symbolic constants
  • n: immediate integer operand which less than a word wide
  • E, F: immediate floating operand
  • G, H: machine-dependent floating operands
  • s: CONST_INT || CONST_DOUBLE || VOIDmode , not including symbol
  • g: Any register, memory or immediate integer operand is allowed, except for registers that are not general registers
  • X: It use when one of the alternative need scratch register but tho other don't
(define_insn "*andsi3_compare0_scratch"
  [(set (reg:CC_NOOV CC_REGNUM)
        (compare:CC_NOOV
         (and:SI (match_operand:SI 0 "s_register_operand" "r,r")
                 (match_operand:SI 1 "arm_not_operand" "rI,K"))
         (const_int 0)))
   (clobber (match_scratch:SI 2 "=X,r"))] <== no need scratch in alter 0
  "TARGET_32BIT"
  "@
   tst%?\\t%0, %1
   bic%.\\t%2, %0, #%B1"
  [(set_attr "conds" "set")]
)
  • 0->9: match with operand 0->9
  • p: address operand allow by TARGET_LEGITIMATE_ADDRESS_P

Constraint Modifier Characters

  • ?: slightly increase the cost of this alternative
  • !: severely increase the cost of this alternative, still have change to use if the constraint meet without reloading
  • =: output operand
  • +: in/output operand
  • &: earlyclobber operand, used in output operand when we want output operands not the same with any input operand
  • %: Declares the instruction to be commutative, then GCC will try to interchange the two operands to fit the constraint
    • should only use in the operator meet commutative law, e.g. add, mul, and.
(define_insn "addhi3"
  [(set (match_operand:HI 0 "general_operand" "=m,r")
        (plus:HI (match_operand:HI 1 "general_operand" "%0,0")
                 (match_operand:HI 2 "general_operand" "di,g")))]
...)
  • #: Only use the constraint for choosing register preferences
  • *: Not use for choosing register preferences and no effect on reloading.

Defining Machine-Specific Predicates and Constraints

  • Use define_predicate and define_special_predicate define machine specific predicate
    • define_predicate will do mode test and define_special_predicate will not
    • mode test: (mode == VOIDmode || GET_MODE (op) == mode);
(define_predicate "arm_float_compare_operand"
  (if_then_else (match_test "TARGET_VFP")
                (match_operand 0 "vfp_compare_operand")
                (match_operand 0 "arm_float_rhs_operand")))

;; True for valid index operands.
(define_predicate "index_operand"
  (ior (match_operand 0 "s_register_operand")
       (and (match_operand 0 "immediate_operand")
            (match_test "(GET_CODE (op) != CONST_INT
                          || (INTVAL (op) < 4096 && INTVAL (op) > -4096))"))))

;; True for operators that can be combined with a shift in ARM state.
(define_special_predicate "shiftable_operator"
  (and (match_code "plus,minus,ior,xor,and")
       (match_test "mode == GET_MODE (op)")))
  • Use define_register_constraint, define_constraint, define_memory_constraint define machine specific constraint
(define_register_constraint "l" "TARGET_THUMB ? LO_REGS : GENERAL_REGS"
 "In Thumb state the core registers r0->r7.")

(define_constraint "M"
 "In Thumb-1 state a constant that is a multiple of 4 in the range 0-1020."
 (and (match_code "const_int")
      (match_test "TARGET_32BIT ? ((ival >= 0 && ival <= 32)
                                 || (((ival & (ival - 1)) & 0xFFFFFFFF) == 0))
                   : ival >= 0 && ival <= 1020 && (ival & 3) == 0")))

(define_memory_constraint "Uv"
 "@internal
  In ARM/Thumb-2 state a valid VFP load/store address."
 (and (match_code "mem")
      (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)")))
  • constraint character for what kind constraint have been specify in reload.c: find_reloads()
    • generic constraint letters: 'E F V X g i m n o p r s'
      • Should not use generic constraint letters to define machine specific constraint
    • G, H: CONST_DOUBLE
    • I, J, K, L, M, N, O, P: CONST_INT

GCC 3.x register allocation

  1. regmove: generate move to satisfy constraint and delete the redundant instructions
  2. regclass: select preferred reg class and alternative reg class for register allocation
  3. local allocator: register allocation of live time with in a basic block
  4. global allocator: register allocation cross blocks (Chow's priority based colouring)
  5. reload: transfer rtl to satisfy operand constraint
    1. indirect code selection
    2. elimination of virtual hardware reg ex: argument pointer
    3. assign stack slot for spill code
    4. generate save/restore code for function call
    5. copy propagation
    6. simple rematerialization (如果重计算比较划算 就重算不做load/store)
    7. address inheritance (move2add) 减小 address reference 的offset
  6. post reload: remove redundent move at the end

GCC 4.x Integrated register allocation

  • color priority的计算方式
priority = ( (live time上用到的hard reg數目) * (freq 這條life time會被執行到的傾向) 
* (log2 被指令使用到的次數) ) / (live time的長度(經過幾個rtl insn))
  • IRA是一个regional allocator 也就是会将live time根据region切成几个allocno 在对每个allocno做allocation
  • region的切法 (可由 -fira-region= 设定):
  1. all (所有loop: IRA_REGION_ALL)
  2. mixed(所有loop除了register pressure很小的loop: IRA_REGION_MIXED is default value)
  3. one 一个function 当一个region (IRA_REGION_ONE)
  • passes in IRA
  1. build IR: regions, allocnos, copies, costs
  2. coloring
  3. spill/restore points move
  4. emitting new registers and code fore register shuffling
  5. caller save optimizations
  6. rebuild IR
  7. reload
  8. back to coloring if need
  • IRA可以定义cover class将register分类 然后根据architecture的ISA性质 给cost
  1. use IRA_COVER to define, and won't need after GCC 4.7
  2. 可以将allocno限制只分配给他cover class内的reg
  3. hard-register cost:用一个vector纪录 他的大小和cover class内的hardware reg数目相同

会利用到 register_move_cost来计算这个vecter

  • IRA 会用到的几个structure
  1. allocno: 在一个region内的pseudo reg life time
  2. conflicting cllocnos
  3. conflicting hard reg
  • 当allocate到caller save reg且cross function call时 cost增加 (增加spill的倾向)
    • ira-costs.c: ira_tune_allocno_costs_and_cover_classes()
/* Change hard register costs for allocnos which lives through
   function calls.  This is called only when we found all intersected
   calls during building allocno live ranges.  */
 if (! ira_hard_reg_not_in_set_p (regno, mode, call_used_reg_set)
                  || HARD_REGNO_CALL_PART_CLOBBERED (regno, mode))
                cost += (ALLOCNO_CALL_FREQ (a)
                         * (ira_memory_move_cost[mode][rclass][0]
                            + ira_memory_move_cost[mode][rclass][1]));
#ifdef IRA_HARD_REGNO_ADD_COST_MULTIPLIER
              cost += ((ira_memory_move_cost[mode][rclass][0]
                        + ira_memory_move_cost[mode][rclass][1])
                       * ALLOCNO_FREQ (a)
                       * IRA_HARD_REGNO_ADD_COST_MULTIPLIER (regno) / 2);
#endif
  • allocno到callee save reg时,由于prolog/epilog可能需push/push所以增加cost
    • ira-color.c: assign_hard_reg
    • HONOR_REG_ALLOC_ORDER only exist in gcc 4.6 or after
#ifndef HONOR_REG_ALLOC_ORDER
      if (! allocated_hardreg_p[hard_regno]
          && ira_hard_reg_not_in_set_p (hard_regno, mode, call_used_reg_set))
        /* We need to save/restore the hard register in
           epilogue/prologue.  Therefore we increase the cost.  */
        {
          /* ??? If only part is call clobbered.  */
          rclass = REGNO_REG_CLASS (hard_regno);
          add_cost = (ira_memory_move_cost[mode][rclass][0]
                      + ira_memory_move_cost[mode][rclass][1] - 1);
          cost += add_cost;
          full_cost += add_cost;
        }
#endif
  • conflict hard-register costs:和hard-register cost vector同大小

对于还没分配到allocno 用这个vecter纪录目前 get到某个hard reg所需的cost

  • 当同一条pseudo reg的某个allocno挑到一个hard reg时 选那个hard reg的cost会降低

使得同一条pseudo reg的其他allocno倾向选同一个hard reg来减少move

  • IRA会倾向将output operand和input operand挑同一个reg
  • cap: nested region里个allocno 针对cap的情况做了些判断
  • 一个allocno被称为"trivially colourable" 只有当 conflict hard reg num + 储存这个 allocno所需的hard reg数目 < available hard reg in same cover class 也就是说一定可以图完色
    • ira-color.c: put_allocno_into_bucket()
  if (ALLOCNO_LEFT_CONFLICTS_NUM (allocno)
      + ira_reg_class_nregs[cover_class][ALLOCNO_MODE (allocno)]
      <= ALLOCNO_AVAILABLE_REGS_NUM (allocno))
    add_allocno_to_bucket (allocno, &colorable_allocno_bucket);
  • IRA分为两个PASS
  1. pass1: 将trivially colourable allocno放进stack 然后剩下的再计算cost放进stack
    • memory cost - reg cost +
      1. 0 for cap
      2. move cost for hard reg
      3. 减load/store cost for memory
    • 是考虑到上层region的allocate情况对cost做改变
    • pass1将最少用到的allocno先放进stack 在pass2时就会将 最常用到的allocno先pop出来做allocate (Chaitin-Briggs colouring的重要改变)
  2. pass2: allocate allocno to a hard reg 没成功或cost太高则spill到 memory
    • 利用下面的cost function决定要allocate哪个hard reg
      • cost(allocno, hard reg) = conflictAllocnoCost(allocno, hard reg) + CopyAllocnoCost(allocno, hard reg)
    • assign allocno到hard reg的cost = 所有和这个allocno conflict的ConflictCost(allocno, hard reg)

加上 所有和这个allocno相连的allocno的copy cost (用来表达 当assign不同hard reg时额外的move cost)

    • 当assign完后和这个allocno相连的allocno的Copy cost会降低 (因为不需要copy嘛 如果选同个reg的话)
  • IRA有个special pass来改变是mem 或 reg 目的是避免太内部的loop有太多的load/store
  • IRA会assign新的pseudo-register给每个allocno 为的是减少reload发生spill时的影响范围

因此造成真正的move的机率很低 就算发生也比整个pseudo-register都spill要来的好

  • IRA有做caller save optimization来减少load/store
  • 在IRA产生save/restore比在reload好

是因为都还是pseudo reg 要做任何简化都比较容易

  • IRA会试著使用同个stack slot来消除memory-memory move.
  • reload会负责分配stack slot

会试著共用stack slot但只针对被reload spill出来的pseudo reg

  • IRA对stack slot的分配
    • 当有FP时 将常用的pseudo先分配到stack slot
    • 当只有SP时 将少用的先分配到stack slog
      1. 消除momery to memory move
      2. 减小fp+offset和sp+offset的offset

GCC 4.x IRA select Best class for allocno

  • Best class will select from important classes
    • important class are the subclasses of cover class
    • that is REG_CLASS we define in machine.h and also a subclass of cover class we define in IRA_COVER_CLASSES
    • IRA_COVER_CLASSES is remove from GCC 4.7
  • Build register class move_cost table for IRA
  • use REGISTER_MOVE_COST porting function to build simple one last_move_cost table
    • reginfo.c: init_move_cost (mode)
for (i = 0; i < N_REG_CLASSES; i++)
    if (contains_reg_of_mode[i][m])
      for (j = 0; j < N_REG_CLASSES; j++)
        {
          int cost;
          if (!contains_reg_of_mode[j][m])
            cost = 65535;
          else
            {
              cost = REGISTER_MOVE_COST (m, i, j);
              gcc_assert (cost < 65535);
            }
          all_match &= (last_move_cost[i][j] == cost);
          last_move_cost[i][j] = cost;
        }
  • use last_move_cost table build (1) move_cost (2) may_move_in_cost (3) may_move_out_cost
    • also in init_move_cost (mode)
              cost = last_move_cost[i][j];

              for (p2 = &reg_class_subclasses[j][0];            
                   *p2 != LIM_REG_CLASSES; p2++)
                if (*p2 != i && contains_reg_of_mode[*p2][m])
                  cost = MAX (cost, move_cost[m][i][*p2]); /* cost is the max cost of class i move to all possible subclass of j */

              for (p1 = &reg_class_subclasses[i][0];
                   *p1 != LIM_REG_CLASSES; p1++)
                if (*p1 != j && contains_reg_of_mode[*p1][m])
                  cost = MAX (cost, move_cost[m][*p1][j]); /* cost is the max cost of from all possible subclass of i
                                                              move to all possible subclass of j */

              gcc_assert (cost <= 65535);
              move_cost[m][i][j] = cost;

              if (reg_class_subset_p (i, j))
                may_move_in_cost[m][i][j] = 0;    /* when move from subclass may_move_in_cost get cost 0 
              else                                   this encourage input operand select bigger class */
                may_move_in_cost[m][i][j] = cost;

              if (reg_class_subset_p (j, i))
                may_move_out_cost[m][i][j] = 0;   /* when move to subclass may_move_out cost get cost 0
              else                                   this encourage output operand select smaller class */
                may_move_out_cost[m][i][j] = cost;
  • calculate reg_class move cost for each allocno
    • ira-costs.c: find_allocno_class_costs()
/* Find costs of register classes and memory for allocnos and their
   best costs. */
  • which will scan each insn's operand to calculate cost
    • call graph would be
    find_allocno_class_costs()
      -> process_bb_node_for_costs ()
         -> scan_one_insn ()
            -> record_operand_costs ()
               -> record_reg_classes ()
  • record_reg_classes ()
    • process constraint to calculate each cost_class(important class) cost for each operand
                case '*': /* the constraint after '*' not influence best class selection */
                  /* Ignore the next letter for this pass.  */
                  c = *++p;
                  break;

                case '?': /* increase cost for the constraint after '?' */
                  alt_cost += 2;
                case '!':  case '#':  case '&':
                case '0':  case '1':  case '2':  case '3':  case '4':
                case '5':  case '6':  case '7':  case '8':  case '9':
                  break;

                case 'p':
                  allows_addr = 1;
                  win = address_operand (op, GET_MODE (op));
                  /* We know this operand is an address, so we want it
                     to be allocated to a register that can be the
                     base of an address, i.e. BASE_REG_CLASS.  */
                  classes[i]
                    = ira_reg_class_union[classes[i]]
                      [base_reg_class (VOIDmode, ADDRESS, SCRATCH)];
                  break;

                case 'm':  case 'o':  case 'V':
                  /* It doesn't seem worth distinguishing between
                     offsettable and non-offsettable addresses
                     here.  */
                  insn_allows_mem[i] = allows_mem[i] = 1;
                  if (MEM_P (op))
                    win = 1;
                  break;
  • calculate cost for operand i
                  struct costs *pp = this_op_costs[i];

                  for (k = 0; k < cost_classes_num; k++)
                    {
                      rclass = cost_classes[k]; /* rclass is any possible important class */
                      pp->cost[k]
                        = (((recog_data.operand_type[i] != OP_OUT
                             ? ira_get_may_move_cost (mode, rclass,          /* will use may_move_in_cost[] */
                                                      classes[i], true) : 0) /* classes[i] is the constraint we fill for operand i */
                            + (recog_data.operand_type[i] != OP_IN
                               ? ira_get_may_move_cost (mode, classes[i],    /* will use may_move_out_cost[] */
                                                        rclass, false) : 0))
                           * frequency); /* frequency present the execution times of basic block */
                    }
  • Final adjust best cost for each operand
      op_cost_add = alt_cost * frequency;
      /* Finally, update the costs with the information we've
         calculated about this alternative.  */
      for (i = 0; i < n_ops; i++)
        if (REG_P (ops[i]) && REGNO (ops[i]) >= FIRST_PSEUDO_REGISTER)
          {
            struct costs *pp = op_costs[i], *qq = this_op_costs[i];
            int scale = 1 + (recog_data.operand_type[i] == OP_INOUT);

            pp->mem_cost = MIN (pp->mem_cost,
                                (qq->mem_cost + op_cost_add) * scale);

            for (k = 0; k < cost_classes_num; k++)
              pp->cost[k]
                = MIN (pp->cost[k], (qq->cost[k] + op_cost_add) * scale);
          }
  • definition of ALLOCNO_BAD_SPILL_P
    • when op is a pseudo register and insn not allow memory
    • allocate hard reg fail will generate spill code
  for (i = 0; i < n_ops; i++)
    {
      ira_allocno_t a;
      rtx op = ops[i];

      if (! REG_P (op) || REGNO (op) < FIRST_PSEUDO_REGISTER)
        continue;
      a = ira_curr_regno_allocno_map [REGNO (op)];
      if (! ALLOCNO_BAD_SPILL_P (a) && insn_allows_mem[i] == 0)
        ALLOCNO_BAD_SPILL_P (a) = true;
    }
  • scan_one_insn()
    • accumulate reg_class_cost of each operand to COSTS_OF_ALLOCNO
  record_operand_costs (insn, op_costs, allocno_pref);

  /* Now add the cost for each operand to the total costs for its
     allocno.  */
  for (i = 0; i < recog_data.n_operands; i++)
    if (REG_P (recog_data.operand[i])
        && REGNO (recog_data.operand[i]) >= FIRST_PSEUDO_REGISTER)
      {
        int regno = REGNO (recog_data.operand[i]);
        struct costs *p
          = COSTS_OF_ALLOCNO (allocno_costs,
                              ALLOCNO_NUM (ira_curr_regno_allocno_map[regno]));
        struct costs *q = op_costs[i];

        p->mem_cost += q->mem_cost;
        for (k = 0; k < cost_classes_num; k++)
          p->cost[k] += q->cost[k];
      }

GCC 3.x Tail call optimization

  • calls.c: expand_call () will fill "CALL PLACEHOLDER"
    • which will contain three versions of RTL instructions to represent the current function call
      1. normal call
      2. tail recursion
      3. sibling call
  • code generation
  1. The caller stores the callee’s arguments in the area for its own incoming arguments. (By definition, this is not part of the epilogue, but without this step the following instructions would not make sense.)
  2. The caller has to pop callee saved registers, should it be required
  3. If not given the option -fomit-frame-pointer, the caller restores the frame and base pointer, by reloading both with their prior values.
  4. Instead of finishing with a generic function call, the callee is invoked via the architecture’s low level jump command to reuse the existing stack frame along with the stack slots reserved for incoming arguments.
  • limitations
  1. No pending clean ups
  2. Matching argument sizes of caller and callee
  3. No variable argument functions
  4. Sibling call epilogue must be defined
  5. No struct returns
  6. Caller and callee must have matching return types
  7. No setjmp and longjmp
  8. No volatile functions
  9. The caller’s stack frame must be empty
  10. No indirect calls
  11. No position independent code
  12. No overlapping arguments
  13. (Partly) Machine-dependent constraints

Sequence point

*p++ = *p + 10;
  • cause *p have higher execution priority
    • lvalue get *p address
  • Howeve, rvalue (*p+10) is execute before *p++ or after ?
    • in fact, "=" is not a sequence point
      • therefore, the execution prority depend on the implementation of the compiler
        • GCC will execute (*p + 10) first
        • OPEN64 will execute (*p++) first
 
Volatile reserve word in C make the statement to be a "sequence point"

Addressing modes

  • have pre/post/inc/dec or not
HAVE_PRE_INCREMENT 
HAVE_PRE_DECREMENT 
HAVE_POST_INCREMENT 
HAVE_POST_DECREMENT
  • have pre/post modify with a "constant" or not
HAVE_PRE_MODIFY_DISP 
HAVE_POST_MODIFY_DISP
  • have pre/post modify with a "reg" or not
HAVE_PRE_MODIFY_REG
HAVE_POST_MODIFY_REG
  • CONSTANT_ADDRESS_P (x): a valid constant address or not
  • CONSTANT_P (x): is symbol_ref, label_ref, high expressions, const arithmetic expressions, const_int , const_double expressions or not.
  • MAX_REGS_PER_ADDRESS: max reg number in one valid memory address
  • TARGET_LEGITIMATE_ADDRESS_P: a valid memory address or not
bool TARGET_LEGITIMATE_ADDRESS_P (enum machine mode mode,
rtx x, bool strict)
  • strict being true means already do register allocation
    • use the array reg_renumber[pseudo regno]=hard regno or -1 for memory
    • use #ifdef REG_OK_STRICT conditional in source file to define the strict variant in that case
  • TARGET_LEGITIMIZE_ADDRESS: return a valid address when original address is invalid
rtx TARGET_LEGITIMIZE_ADDRESS (rtx x, rtx oldx, enum machine mode mode)
  • The compiler has standard ways of doing so in all invalid cases.
  • But often a "machine-dependent strategy" can generate better code
  • LEGITIMIZE_ADDRESS will be called in explow.c: memory_address()
/* Return something equivalent to X but valid as a memory address
   for something of mode MODE.  When X is not itself valid, this
   works by copying X or subexpressions of it into registers.  */
rtx
memory_address (enum machine_mode mode, rtx x) (gcc 4.4.4)
memory_address_addr_space (enum machine_mode mode, rtx x, addr_space_t as) (gcc 4.6.2)
  • memory_address will do
  1. convert address to Pmode (why?)
  2. force constant address to register for better cse chance
  3. call break_out_memory_refs ()
    • force constant_address to reg and simplify the address calculation.
  4. call LEGITIMIZE_ADDRESS
  5. Symplify to (reg/symbol + const) format
  6. force to reg if MULT or MINUS
/* Return a copy of X in which all memory references
   and all constants that involve symbol refs
   have been replaced with new temporary registers.
   Also emit code to load the memory locations and constants
   into those registers.

   If X contains no such constants or memory references,
   X itself (not a copy) is returned.

   If a constant is found in the address that is not a legitimate constant
   in an insn, it is left alone in the hope that it might be valid in the
   address.

   X may contain no arithmetic except addition, subtraction and multiplication.
   Values returned by expand_expr with 1 for sum_ok fit this constraint.  */

static rtx
break_out_memory_refs (rtx x)
  • what arm gcc do in LEGITIMIZE_ADDRESS
    1. transfer legitimize_tls_address to valid
    2. transfer base + big const => base + valid const
    3. transfer invalid big const => base + valid const

Naming pattern

  • cstore<mode>4
    • Store zero or nonzero in operand 0 according to whether a comparison is true
      • Operand 0: comparison result is true or false
      • Operand 1: comparison operator.
      • Operand 2 and operand 3: first and second operand of the comparison.
  • cbranch<mode>4
    • Conditional branch instruction combined with a compare instruction
      • Operand 0:comparison operator
      • Operand 1 and operand 2: first and second operands of the comparison
      • Operand 3: label_ref that refers to the label to jump to
  • call
    • Subroutine call instruction returning no value.
      • Operand 0: function to call
      • Operand 1: number of bytes of arguments pushed as a const_int
      • Operand 2: number of registers used as operands
  • call_value
    • Subroutine call instruction returning a value
      • Operand 0: hard register in which the value is returned
      • Other three operands just like "call"

PC relative jumptable

/* Define this macro to be an expression with a nonzero value if jump tables (for
   tablejump insns) should be output in the text section, along with the assembler
   instructions. Otherwise, the readonly data section is used.  */
#define JUMP_TABLES_IN_TEXT_SECTION 1
  • Use "casesi" to implement dispatch table
    • Instruction to jump through a dispatch table, including bounds checking
      • Operand 0: The index to dispatch on, which has mode SImode
      • Operand 1: The lower bound for the table, an integer constant
      • Operand 2: The total range of the table (range = max index - min index)
      • Operand 3: label of the jumptable
      • Operand 4: label for jump out jumptable when condition not match
  • Condition of jumptable generation
    • in stmt.c: expand_case()
    • when the condition not satisfied which is the case index is too sparse
      • GCC will generate binary search tree for switch statement
If (count >= case_values_threshold ()  &&  (range <= (optimize_insn_for_size_p () ? 3 : 10) * count))
{
  Generate jump_table
}
count: total switch number
range: max_case_index – min_case_index
  • Example
EX:   count = 5 (there are 5 case)
      Range = 14 – 10 = 4
Switch(i)
{
        case 10:
            c = a + b;
            break;
        case 11:
            c = a - b;
            break;
        case 12:
            c = a * b;
            break;
        case 13:
            c = a / b;
            break;
        case 14:
            c = a & b;
            break;
        default:
}
  • GCC default case_values_threshold () is 4 when "casesi" pattern is implement
    • otherwise it would be 5
    • could use TARGET_CASE_VALUES_THRESHOLD change it
  • optimize_insn_for_size_p () will be true when -Os is set
    • Therefore, when –Os condition would be
If (count >= 4  &&  (range <= 3 * count))
{
  Generate jump_table
}
  • Jumptable generation source code
      else if (count < targetm.case_values_threshold ()
               || compare_tree_int (range,
                                    (optimize_insn_for_size_p () ? 3 : 10) * count) > 0
               /* RANGE may be signed, and really large ranges will show up
                  as negative numbers.  */
               || compare_tree_int (range, 0) < 0
#ifndef ASM_OUTPUT_ADDR_DIFF_ELT
               || flag_pic
#endif
               || !flag_jump_tables
               || TREE_CONSTANT (index_expr)
               /* If neither casesi or tablejump is available, we can
                  only go this way.  */
               || (!HAVE_casesi && !HAVE_tablejump))
        {
  • ASM_OUTPUT_ADDR_DIFF_ELT (stream, body, value, rel)
    • generate dispatch table entry as relative address (label difference .L-.L)
    • when ASM_OUTPUT_ADDR_DIFF_ELT not defined, -fpic will not generate jumptable
      • because GCC will use ASM_OUTPUT_ADDR_VEC_ELT generate dispatch table entry
        • which will be absolute address (.L)
  • Define CASE_VECTOR_PC_RELATIVE will use ASM_OUTPUT_ADDR_DIFF_ELT generate tabe entry
    • which will generate PC relative jump table
  • The advantage of PC relative jump table
    • the table entry contain the difference beteen table label and case label
    • when table label and max case label is close enough
    • we could change the table entry mode as QImode to reduce the table size
  • To do so need define CASE_VECTOR_SHORTEN_MODE(min, max, body)
    • which operands are the min case label address and min case label address
      • it calculate by the instruction attribute length defined in md file
      • use this macro to determine the mode of table entry
        • ASM_OUTPUT_ADDR_DIFF_ELT will get the mode and generate right format
  • When the table entry may change mode, the alignment issue should be consider
    • use ADDR_VEC_ALIGN to determine the jumptable alignment
    • use ASM_OUTPUT_CASE_END to determine the alignment after jumptable

Passing Function Arguments

  • ACCUMULATE_OUTGOING_ARGS
/* Define this if the maximum size of all the outgoing args is to be
   accumulated and pushed during the prologue.  The amount can be
   found in the variable crtl->outgoing_args_size.  */
  • REG_PARM_STACK_SPACE
    • define this macro will preserve fixed stack size for arguments, even if passed in registers.
  • OUTGOING_REG_PARM_STACK_SPACE
    • define this macro when preserve fixed stack size for arguments is allocated by caller
  • TARGET_RETURN_POPS_ARGS (caller pop argument stack or callee pop)
    • returns the number of bytes of its own arguments that a function pops on returning
    • or return 0 if the function pops no arguments and the caller must therefore pop them all after the function returns
  • TARGET_FUNCTION_ARG
    • return reg rtx to indicate the argument passed by which register
    • or return 0 indicate passed on the stack
  • TARGET_FUNCTION_ARG_ADVANCE
    • use to update the current argument register usage in structure CUMULATIVE_ARGS
  • TARGET_FUNCTION_ARG_BOUNDARY
    • returns the alignment boundary, in bits, of an argument with the specified mode and type
    • default hook returns PARM_BOUNDARY for all arguments
  • TARGET_ARG_PARTIAL_BYTES
  • returns the number of bytes at the beginning of an argument that must be put in registers
  • return 0 for arguments that are passed entirely in registers or on the stack
  • FUNCTION_ARG_REGNO_P
    • return nonzero if regno is the number of a hard register in which function arguments are sometimes passed
  • TARGET_PASS_BY_REFERENCE
    • return true if an argument at the position indicated by cum should be passed by reference

Frame Pointer Need

  • when frame pointer is needed
    1. No omit frame pointer flag
    2. the function would call alloca and then modify stack frame
    3. enable stack checking flag
    4. accesses_prior_frames
    5. stack need realignment
    6. target defined fp required in TARGET_FRAME_POINTER_REQUIRED
    • defined in ira.c:ira_setup_eliminable_regset()
  /* FIXME: If EXIT_IGNORE_STACK is set, we will not save and restore
     sp for alloca.  So we can't eliminate the frame pointer in that
     case.  At some point, we should improve this by emitting the
     sp-adjusting insns for this case.  */
  int need_fp
    = (! flag_omit_frame_pointer
       || (cfun->calls_alloca && EXIT_IGNORE_STACK)
       /* We need the frame pointer to catch stack overflow exceptions
          if the stack pointer is moving.  */
       || (flag_stack_check && STACK_CHECK_MOVING_SP)
       || crtl->accesses_prior_frames
       || crtl->stack_realign_needed
       || targetm.frame_pointer_required ());

  frame_pointer_needed = need_fp;
  /* Nonzero if function being compiled called builtin_return_addr or
     builtin_frame_address with nonzero count.  */
  bool accesses_prior_frames;
  • in reload pass will try to eliminate fp by the porting target hook
    • TARGET_CAN_ELIMINATE
    • INITIAL_ELIMINATION_OFFSET
  • if eliminate fp fail, frame_pointer_needed will be 1
  • define in reload1.c: update_eliminables()

Anchored addresses

  • GCC anchored addresses optimization could transfer the following code
    • On some targets, it would be better to calculate just one symbolic address and access the three variables relative to it
    • could use "fsection-anchors" to enable if the target have been implement
static int a = 0, b = 1, c = 2;
int foo (void) { return a + b + c; }
  • as
int foo (void)
{
register int *xr = &x;
return xr[&a - &x] + xr[&b - &x] + xr[&c - &x];
}
* NOTE: global variables in a object should be strong symbol, 
        or else it would put into common section instead of anchor section.
  • To enable anchored addresses at last need define following macro
    • SET_ASM_OP: assemlber support for setting an address to symbol
    • TARGET_MAX_ANCHOR_OFFSET: max offset relative to base that permit to generate
    • TARGET_MIN_ANCHOR_OFFSET: min offset relative to base that permit to generate
  • the base would be the anchored section start address.
  • Other anchored addresses relative macro
    • TARGET_ASM_OUTPUT_ANCHOR: how to setting the anchor section start address
    • TARGET_USE_ANCHORS_FOR_SYMBOL_P: what kind symbol could move into anchor section
      • mips use this macro to avoid move GP relative and PC relative symbol into anchor section
  • Limitation
    • Can't handle weak symbol, cause the data type(size) not determinate
    • The effect scope in single compile unit (Not IPA)

Structure Passing as Argument

  • TARGET_PASS_BY_REFERENCE: determine pass by reference or not
    • could use this macro control pass by reg or by reference

MI_THUNK on GCC for C++

  • A wrapper function to implement C++ virtual function calls
  • TARGET_ASM_OUTPUT_MI_THUNK: update "this" pointer by add delta, and then could use "this" pointer to find the real function body for virtual function inheritance.

ARM ABI

       old stack pointer -> |    |
                             ----
                            |    | \
                            |    |   saved arguments for
                            |    |   vararg functions
                            |    | /
                              --
   hard FP & arg pointer -> |    | \
                            |    |   stack
                            |    |   frame
                            |    | /
                              --
                            |    | \
                            |    |   call saved
                            |    |   registers
      soft frame pointer -> |    | /
                              --
                            |    | \
                            |    |   local
                            |    |   variables
     locals base pointer -> |    | /
                              --
                            |    | \
                            |    |   outgoing
                            |    |   arguments
   current stack pointer -> |    | /
                              --
  • ARM define function arm_get_frame_offsets to calculate stack offsets
    • It would be call several times when needs frame info.
  • arm define the structure arm_stack_offsets in machine_function structure
typedef struct GTY(()) arm_stack_offsets
{
  int saved_args;       /* ARG_POINTER_REGNUM.  */ = (args.pretend_args_size for variadic functions) 
  int frame;            /* ARM_HARD_FRAME_POINTER_REGNUM.  */ = (args.pretend_args_size for variadic functions)
 + (static_chain_stack_bytes) + (frame_pointer when need)
  int saved_regs;       = (args.pretend_args_size for variadic functions) + (stack size for callee saved register) + ('''static_chain_stack_bytes''');
  int soft_frame;       /* FRAME_POINTER_REGNUM.  */ = (saved_regs) + (CALLER_INTERWORKING_SLOT_SIZE ??)
  int locals_base;      /* THUMB_HARD_FRAME_POINTER_REGNUM.  */ = (soft_frame) + (local variables size)
  int outgoing_args;    /* STACK_POINTER_REGNUM.  */ = (locals_base) + (outgoing_args)
  unsigned int saved_regs_mask;
}

ARM Change ARG_POINTER Alignment for DOUBLEWORD Alignment ABI

  • ARM use the following code to change argument pointer alignment
/* Do anything needed before RTL is emitted for each function.  */
void
arm_init_expanders (void)
{
  /* Arrange to initialize and mark the machine per-function status.  */
  init_machine_status = arm_init_machine_status;

  /* This is to stop the combine pass optimizing away the alignment
     adjustment of va_arg.  */
  if (cfun)
    mark_reg_pointer (arg_pointer_rtx, PARM_BOUNDARY); <= setting argument pointer alignment
}
  • Without these code, the GCC testsuite test case va-arg-26.c may get wrong
#include <stdarg.h>

double f (float f1, float f2, float f3, float f4,
          float f5, float f6, ...)
{
  va_list ap;
  double d;

  va_start (ap, f6);
  d = va_arg (ap, double);
  va_end (ap);
  return d;
}

int main ()
{
  if (f (1, 2, 3, 4, 5, 6, 7.0) != 7.0)
    abort ();
  exit (0);
}
  • va-arg-26.c.144r.expand
(insn 11 10 12 (set (reg:SI 134)
        (plus:SI (reg/f:SI 128 virtual-incoming-args)
            (const_int 4 [0x4]))) va-arg-26.c:9 -1
     (nil))

(insn 14 13 15 (set (reg:SI 149)
        (plus:SI (reg/f:SI 134 [ D.2028 ])
            (const_int 7 [0x7]))) va-arg-26.c:10 -1
     (nil))

(insn 15 14 0 (set (reg:SI 137 [ D.2030 ])
        (and:SI (reg:SI 149)
            (const_int -8 [0xfffffffffffffff8]))) va-arg-26.c:10 -1
     (nil))
  • For Double word alignment ABI
 GCC will emit (a + 7) & -8 to fit double word alignment
  • GCC could use the macro REGNO_POINTER_ALIGN
 to mark the register value in certain alignment
  • Registers alignment will initialize in emit-rtl.c
void
init_emit (void)
{
...
#ifdef STACK_BOUNDARY
  REGNO_POINTER_ALIGN (STACK_POINTER_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (FRAME_POINTER_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (ARG_POINTER_REGNUM) = STACK_BOUNDARY;

  REGNO_POINTER_ALIGN (VIRTUAL_INCOMING_ARGS_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (VIRTUAL_STACK_VARS_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (VIRTUAL_STACK_DYNAMIC_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (VIRTUAL_OUTGOING_ARGS_REGNUM) = STACK_BOUNDARY;
  REGNO_POINTER_ALIGN (VIRTUAL_CFA_REGNUM) = BITS_PER_WORD;
#endif

#ifdef INIT_EXPANDERS
  INIT_EXPANDERS;
#endif
}
  • ARM will set STACK_BOUNDARY to 64 for DOUBLEWORD alignment ABI
 Therefore, ARG_POINTER_REGNUM will set to 64 bit alignemnt.
  • in combine phase, combine.c:8156
 function  force_to_mode ()
    case PLUS:
      /* In (and (plus FOO C1) M), if M is a mask that just turns off
         low-order bits (as in an alignment operation) and FOO is already
         aligned to that boundary, mask C1 to that boundary as well.
         This may eliminate that PLUS and, later, the AND.  */

      {
        unsigned int width = GET_MODE_BITSIZE (mode);
        unsigned HOST_WIDE_INT smask = mask;

        /* If MODE is narrower than HOST_WIDE_INT and mask is a negative
           number, sign extend it.  */

        if (width < HOST_BITS_PER_WIDE_INT
            && (smask & ((unsigned HOST_WIDE_INT) 1 << (width - 1))) != 0)
          smask |= (unsigned HOST_WIDE_INT) (-1) << width;

        if (CONST_INT_P (XEXP (x, 1))
            && exact_log2 (- smask) >= 0
            && (nonzero_bits (XEXP (x, 0), mode) & ~smask) == 0
            && (INTVAL (XEXP (x, 1)) & ~smask) != 0)
          return force_to_mode (plus_constant (XEXP (x, 0),
                                               (INTVAL (XEXP (x, 1)) & smask)),
                                mode, smask, next_select);
      }
  • These code try to eliminate the constant C1
 (and (plus FOO C1) M)
  • When FOO already fit the alignment in M,
 then the lower bit in C1 could optimize out.

EX: (FOO + 11) & 0xfffffff8 could optimize to

   (FOO + 8) & 0xfffffff8 when FOO already in mask alignemnt (which is 8-byte alignemnt)
  • (nonzero_bits (XEXP (x, 0), mode) & ~smask) == 0 <== check Dose the FOO in mask alignment
  • (INTVAL (XEXP (x, 1)) & ~smask) != 0) <== check whether the C1 lower bits not zero
  • If ARG_POINTER_REGNUM is 64 bit alignemnt,
 (ARG_POINTER_REGNUM + 11) & (-8) will optimize 
 to (ARG_POINTER_REGNUM + 8) which not ARM really want.
  • Therefore, ARM use mark_reg_pointer (arg_pointer_rtx, PARM_BOUNDARY)
 Set arg_pointer to 32 bit alignemnt.

ARM GCC constant generation

  • constant_ok_for_arm: 可用一道指令设定出来的constant值
    1. 小数值 (8 bit可表示)
    2. 小数值 + SHIFT
    3. 小数值 + ROTATE
    4. 小数值 + OR (partial repeat)
  • const_ok_for_op:
    • 可minus或可inverted的operation, 转换后的constant是否符合 const_ok_for_arm
  • arm_split_constant
    • 由arm_gen_constant计算生成某constant的cost
 cost太高時split成move pair, 否則用arm_gen_constant 生成constant
  • arm_gen_constant:做constant拆解的分析和generate
    • quick check
      • IOR:
        1. a = b or 0xffffffff ==> a = 0xffffffff
        2. a = b or 0 ==> a = b
      • AND:
        1. a = b and 0 ==> a = 0
        2. a = b and 0xffffffff ==> a = b
      • XOR:
        1. a = b xor 0 ==> a = b
        2. a = b xor 0xffffffff ==> a = not (b)
      • MINUS
        1. a = 0 - b ==> a = neg(b)
      • 符合const_ok_for_arm的常数,emit出相对应的指令
    • more transformation
      • SET:
        1. set low_part
        2. move then shift
        3. move then add/sub
        4. move then or
      • IOR/XOR:
      • mvn support transformation
      • AND: shift transformation

Trampolines for Nested Functions

  • gcc testsuite testcase 20000822-1.c
#ifndef NO_TRAMPOLINES
int f0(int (*fn)(int *), int *p)
{
  return (*fn) (p);
}

int f1(void)
{
  int i = 0;

  int f2(int *p)
  {
    i = 1;
    return *p + 1;
  }

  return f0(f2, &i);
}
#endif

int main()
{
#ifndef NO_TRAMPOLINES
  if (f1() != 2)
    abort ();
#endif
  return 0;
}
  • arm assembly code
        .file   "20000822-1.c"
        .text
        .align  2
        .type   f2.1205, %function
f2.1205:
        @ Nested: function declared inside another function.
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        mov     r3, #1
        str     r3, [ip, #0]  <=== load static chain
        ldr     r0, [r0, #0]  <=== access variable of function f1 by static chain
        add     r0, r0, r3
        bx      lr
        .size   f2.1205, .-f2.1205
        .align  2
        .global f0
        .type   f0, %function
f0:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        mov     r3, r0
        str     lr, [sp, #-4]!
        mov     r0, r1
        mov     lr, pc
        bx      r3
        ldr     pc, [sp], #4
        .size   f0, .-f0
        .section        .rodata
        .align  2
.LTRAMP0:
        ldr     ip, [pc, #0]       <== load static chain value to ip
        ldr     pc, [pc, #0]       <== jump to function f2.1205
        .word   0
        .word   0
        .global __clear_cache
        .text
        .align  2
        .global f1
        .type   f1, %function
f1:
        @ args = 0, pretend = 0, frame = 20
        @ frame_needed = 0, uses_anonymous_args = 0
        stmfd   sp!, {r4, lr}
        ldr     r3, #.LTRAMP0
        sub     sp, sp, #20
        ldmia   r3, {r0, r1, r2, r3}   <== load trampolines code from .LTRAMP0
        add     r4, sp, #4
        stmia   r4, {r0, r1, r2, r3}   <== store trampolines code to stack
        add     r3, sp, #0
        str     r3, [r4, #8]  <== store static chain value to stack
        ldr     r3, #f2.1205
        mov     r0, r4
        add     r1, sp, #20
        str     r3, [r4, #12]  <== store function address to stack         
        bl      __clear_cache  <== sync icache content with memory
        mov     r3, #0
        str     r3, [sp, #0]
        mov     r0, sp
        mov     lr, pc
        bx      r4
        add     sp, sp, #20
        ldmfd   sp!, {r4, pc}
        .size   f1, .-f1
        .section        .text.startup,"ax",%progbits
        .align  2
        .global main
        .type   main, %function
main:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        str     lr, [sp, #-4]!
        bl      f1
        cmp     r0, #2
        blne    abort
.L5:
        mov     r0, #0
        ldr     pc, [sp], #4
        .size   main, .-main
        .ident  "GCC: (GNU) 4.6.2"
  • GCC porting macro for trampolines
/* To generate trampoline code in .LTRAMP0 */
#define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template
/* To emit trampoline initial code, including copy trampoline code to stack 
   and copy static chain and function address to stack */
#define TARGET_TRAMPOLINE_INIT arm_trampoline_init

/* Emit RTL insns to initialize the variable parts of a trampoline.  */

static void
arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx fnaddr, mem, a_tramp;

  emit_block_move (m_tramp, assemble_trampoline_template (),  <== copy trampoline code to stack
                   GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);

  mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12);
  emit_move_insn (mem, chain_value);  <== store static chain value

  mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16);
  fnaddr = XEXP (DECL_RTL (fndecl), 0);
  emit_move_insn (mem, fnaddr);  <== store function address

  a_tramp = XEXP (m_tramp, 0);
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),  <== emit sync icache call
                     LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode,
                     plus_constant (a_tramp, TRAMPOLINE_SIZE), Pmode);
  • arm define TARGET_TRAMPOLINE_ADJUST_ADDRESS to set mode to thumb mode for TARGET_THUMB
#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address
/* Thumb trampolines should be entered in thumb mode, so set
   the bottom bit of the address.  */

static rtx
arm_trampoline_adjust_address (rtx addr)
{
  if (TARGET_THUMB)
    addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx,
                                NULL, 0, OPTAB_LIB_WIDEN);
  return addr;
}
/* Length in units of the trampoline for entering a nested function.  */
#define TRAMPOLINE_SIZE  (TARGET_32BIT ? 16 : 20)
/* Alignment required for a trampoline in bits.  */
#define TRAMPOLINE_ALIGNMENT  32
/* the trampoline code alignment in data section and in run time stack */

Study of C++ ABI for Itanium: Exception handling

  • Personality routine
    • 找目前frame的handler, 并判断current frame找到的handler能不能处理现在catch到的exception
    • 用来处理 frame跳转 相关的设定
      • 例如exception object pointer 要如何传递, 设定landing pad.
    • 是language dependent, 因为不同的语言可能会设计不同的处理方式
      • g++定义在 libstdc++-v3/libsupc++/eh_personality.cc
  • 在unwind的过程中,会先decode FDE来找到目前frame的 personality routine,存在 再跳进去
  • Unwinding的原因有两种
    • exception 例如c++ support的
      • 由personality routine判断有没有handler可以处理
      • _Unwind_RaiseException perform exception style unwinding
    • force unwinding 例如 longjmp 或 thread termination
      • 不需要personality routine去判断,只想做直接的跳转 所以用_UA_FORCE_UNWIND来标记这件事
      • _Unwind_ForcedUnwind perform force unwinding
  • __attribute__(cleanup) is a GCC extension which will do unwinding

The Unwind Process

  1. Search Phase
    • unwinding然后call personality routine with _UA_SEARCH_PHASE flag
    • unwinding是不断的利用return address找出current frame的personaltiy routine,再判断有没有可以处理的handler存在, 在Search phase并不会真的改变unwind state
    • 当Search phase找到可以处理的handler时才会进入第二的phase,没找会call terminate().
  2. Cleanup Phase
    • unwind然后call personality routine with _UA_CLEARNUP_PHASE flag
    • 这次会真的改变register state然后跳转到 user landing pad code (user handler code)
  • Resumptive exception handling and Non-resumptive
    • Resumptive model: exception handle执行完之后,程式会从发生exception的点开始执行
    • Non-resumptive model: exception handler执行完之后,程式会由catch block的下个statement开始执行
    • C++是属于Non-resumptive
  • Two phase unwinding对于Non-resumptive其实不一定需要,但GCC为了兼容不同语言,可能是Resumptive or Non-resumptive而做了two phase的设计

Exception header

  • Exception structure in GCC, define in unwind-generic.h
 struct _Unwind_Exception
 86 {
 87   _Unwind_Exception_Class exception_class;
 88   _Unwind_Exception_Cleanup_Fn exception_cleanup;
 89   _Unwind_Word private_1;
 90   _Unwind_Word private_2;
 91
 92   /* @@@ The IA-64 ABI says that this structure must be double-word aligned.
 93      Taking that literally does not make much sense generically.  Instead we
 94      provide the maximum alignment required by any type for the machine.  */
 95 } __attribute__((__aligned__));
  • exception_class 和语言相关也和实作相关, 可以用来分辨是同个语言的exception或是不同语言的(native and foreign exceptions)
  • exception_cleanup 删除exception object
    • exception_cleanup 的其中一个参数是 _Unwind_Reason_Code reason, 用来描述为何要删除这个exception object
      • _URC_FOREIGN_EXCEPTION_CAUGHT = 1 一些foreign相关导致产生 未定义行为的情况
      • _URC_FATAL_PHASE1_ERROR = 3 personality routine在phase 1发生错误
      • _URC_FATAL_PHASE2_ERROR = 2 personality routine在phase 2发生错误

Unwind Context

  • _Unwind_Context structure用来记录frame的状态
    • _Unwind_Context for setjmp longjmp unwinding 定义在unwind-sjlj.c
    • _Unwind_Context for dwarf2 定义在 unwind-dw2.c
  58 /* This is the register and unwind state for a particular frame.  This
  59    provides the information necessary to unwind up past a frame and return
  60    to its caller.  */
  61 struct _Unwind_Context
  62 {
  63   void *reg[DWARF_FRAME_REGISTERS+1];
  64   void *cfa;
  65   void *ra;
  66   void *lsda;
  67   struct dwarf_eh_bases bases;
  68   /* Signal frame context.  */
  69 #define SIGNAL_FRAME_BIT ((~(_Unwind_Word) 0 >> 1) + 1)
  70   /* Context which has version/args_size/by_value fields.  */
  71 #define EXTENDED_CONTEXT_BIT ((~(_Unwind_Word) 0 >> 2) + 1)
  72   _Unwind_Word flags;
  73   /* 0 for now, can be increased when further fields are added to
  74      struct _Unwind_Context.  */
  75   _Unwind_Word version;
  76   _Unwind_Word args_size;
  77   char by_value[DWARF_FRAME_REGISTERS+1];
  78 };

Throwing an Exception

  • _Unwind_RaiseException define in unwind.inc
 79 /* Raise an exception, passing along the given exception object.  */
 80
 81 _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
 82 _Unwind_RaiseException(struct _Unwind_Exception *exc)
 83 {
 84   struct _Unwind_Context this_context, cur_context;
 85   _Unwind_Reason_Code code;
 86
 87   /* Set up this_context to describe the current stack frame.  */
 88   uw_init_context (&this_context);
 89   cur_context = this_context;
 90
 91   /* Phase 1: Search.  Unwind the stack, calling the personality routine
 92      with the _UA_SEARCH_PHASE flag set.  Do not modify the stack yet.  */
 93   while (1)
 94     {
 95       _Unwind_FrameState fs;
 96
 97       /* Set up fs to describe the FDE for the caller of cur_context.  The
 98          first time through the loop, that means __cxa_throw.  */
 99       code = uw_frame_state_for (&cur_context, &fs);
100
101       if (code == _URC_END_OF_STACK)
102         /* Hit end of stack with no handler found.  */
103         return _URC_END_OF_STACK;
104
105       if (code != _URC_NO_REASON)
106         /* Some error encountered.  Usually the unwinder doesn't
107            diagnose these and merely crashes.  */
108         return _URC_FATAL_PHASE1_ERROR;
109
110       /* Unwind successful.  Run the personality routine, if any.  */
111       if (fs.personality)
112         {
113           code = (*fs.personality) (1, _UA_SEARCH_PHASE, exc->exception_class,
114                                     exc, &cur_context);
115           if (code == _URC_HANDLER_FOUND)
116             break;
117           else if (code != _URC_CONTINUE_UNWIND)
118             return _URC_FATAL_PHASE1_ERROR;
119         }
120
121       /* Update cur_context to describe the same frame as fs.  */
122       uw_update_context (&cur_context, &fs);
123     }
124
125   /* Indicate to _Unwind_Resume and associated subroutines that this
126      is not a forced unwind.  Further, note where we found a handler.  */
127   exc->private_1 = 0;
128   exc->private_2 = uw_identify_context (&cur_context);
129
130   cur_context = this_context;
131   code = _Unwind_RaiseException_Phase2 (exc, &cur_context);
132   if (code != _URC_INSTALL_CONTEXT)
133     return code;
134
135   uw_install_context (&this_context, &cur_context);
136 }
  • _Unwind_RaiseException 只有在发生错误时 会return reason code
    • _Unwind_Reason_Code 可能的值
      • _URC_END_OF_STACK: unwinding到最后一个stack,而且找不到可以处理的handler
        • 在这样的case下 The C++ runtime最终会call uncaught_exception()来处理
      • URC_FATAL_PHASE1_ERROR: 在phase1发生非预期的错误 例如 stack corrupt
        • 在这样的情况下最终会call terminate()
  • unwinding发生后 很有可能会改变stack或register的状态,所以"caller of _Unwind_RaiseException"不能对caller目前的state有所假设
  • _Unwind_ForcedUnwind define in unwind.inc
192 /* Raise an exception for forced unwinding.  */
193
194 _Unwind_Reason_Code LIBGCC2_UNWIND_ATTRIBUTE
195 _Unwind_ForcedUnwind (struct _Unwind_Exception *exc,
196                       _Unwind_Stop_Fn stop, void * stop_argument)
197 {
198   struct _Unwind_Context this_context, cur_context;
199   _Unwind_Reason_Code code;
200
201   uw_init_context (&this_context);
202   cur_context = this_context;
203
204   exc->private_1 = (_Unwind_Ptr) stop;
205   exc->private_2 = (_Unwind_Ptr) stop_argument;
206
207   code = _Unwind_ForcedUnwind_Phase2 (exc, &cur_context);
208   if (code != _URC_INSTALL_CONTEXT)
209     return code;
210
211   uw_install_context (&this_context, &cur_context);
212 }
  • Forced unwinding is a single-phase process
    • 真正的unwinding发生在 _Unwind_ForcedUnwind_Phase2
  • Unwinding结束,是由stopstop_argument决定,而不是由 personality routine
  • 当达到停止条件之后 会call _Unwind_DeleteException 然后跳进user code, 否则会return reason code
  • _Unwind_ForcedUnwind 可能会return的 _Unwind_Reason_Code
    • _URC_NO_REASON: 不是想要到达的frame会在call这个frame的personality routine with action flag _UA_FORCE_UNWIND_UA_CLEANUP_PHASE 然后unwind到下个frame在call一次stop function.
    • _URC_END_OF_STACK: 当unwind到最后一个frame都没法满足 stop function时return这个code, 这时context内的sp会设成NULL
    • _URC_FATAL_PHASE2_ERROR: stop function可能会return这个code当发生stack corruption
  • Example:用Force unwinding 实作longjmp_unwind()
    • Setjmp()储存现在的state包含frame pointer的值
    • longjmp_unwind() call _Unwind_ForcedUnwind, stop function设定为unwind到frame pointer的值和Setjmp存的frame pointer 值相同时停止
    • 停止后把state还原为Setjmp()储存的state, 否则就return _URC_NO_REASON 继续unwind
  • _Unwind_Resume define in unwind.inc
215 /* Resume propagation of an existing exception.  This is used after
216    e.g. executing cleanup code, and not to implement rethrowing.  */
217
218 void LIBGCC2_UNWIND_ATTRIBUTE
219 _Unwind_Resume (struct _Unwind_Exception *exc)
220 {
221   struct _Unwind_Context this_context, cur_context;
222   _Unwind_Reason_Code code;
223
224   uw_init_context (&this_context);
225   cur_context = this_context;
226
227   /* Choose between continuing to process _Unwind_RaiseException
228      or _Unwind_ForcedUnwind.  */
229   if (exc->private_1 == 0)
230     code = _Unwind_RaiseException_Phase2 (exc, &cur_context);
231   else
232     code = _Unwind_ForcedUnwind_Phase2 (exc, &cur_context);
233
234   gcc_assert (code == _URC_INSTALL_CONTEXT);
235
236   uw_install_context (&this_context, &cur_context);
237 }
  • _Unwind_Resume是唯一一个user code可以直接用的function, 可以写在user landing pad code的最后 继续做unwinding
    • 这和rethrow不同, rethrow的实作必须在call一次 _Unwind_RaiseException

Context Management

  • 在GCC里有dwarf2和setjmp/longjmp的version
    • 分别定义在unwind-dw2.c和unwind-sjlj.c
  • _Unwind_GetGR:取得context内某个gpr register的value
  • _Unwind_SetGR:设定context内某个gpr register的value
  • _Unwind_GetIP:取得context的return value
  • _Unwind_SetIP:设定context的return value
  • _Unwind_GetLanguageSpecificData: returns the address of the "language-specific data area"(LSDA) for the current stack frame.
  • _Unwind_GetRegionStart: This routine returns the address of the beginning of the procedure or code fragment described by the current unwind descriptor block.

Personality Routine

  • The personality routine is the interface between "system unwind library" and "language-specific exception handling semantic"
    _Unwind_Reason_Code (*__personality_routine)
	    (int version,
	     _Unwind_Action actions,
	     uint64 exceptionClass,
	     struct _Unwind_Exception *exceptionObject,
	     struct _Unwind_Context *context);
  • personality routine的参数
    • version: 用来侦测unwinder和personality routine有没有match
    • action: 描述personality routine的状态 是一个bit mask
    • exceptionClass: 8byte的大小用来描述execption type
      • high 4byte 描述vendor 例如 HP
      • low 4 byte 描述语言 例如 C++
    • context: current frame context (Unwind_Context)
  • Personality routine actions
    • _UA_SEARCH_PHASE: 搜寻目前的frame有没有可以处理目前exception的handler
      • 找到handler回传 URC_HANDLER_FOUND 没找到回传 _URC_CONTINUE_UNWIND
    • _UA_CLEANUP_PHASE: 目前的frame要做cleanup的动作,会利用SET_IP设定ra到landing pad然后return code _URC_INSTALL_CONTEXT
    • _UA_HANDLER_FRAME: 这用在phase2, 用来标记现在的frame就是在phase1找到handler的那个frame.
    • _UA_FORCE_UNWIND: 摽记目前是做force unwind, 不需要personalty routine判断可不可以处理这个excetpion.

C++ Exception object create and destroy

C++ Exception objects

  • C++ exception object structure defined in gcc_folder/libstdc++-v3/libsupc++/unwind-cxx.h
 45 // A primary C++ exception object consists of a header, which is a wrapper
 46 // around an unwind object header with additional C++ specific information,
 47 // followed by the exception object itself.
 48
 49 struct __cxa_exception
 50 {
 51   // Manage the exception object itself.
 52   std::type_info *exceptionType;
 53   void (*exceptionDestructor)(void *);
 54
 55   // The C++ standard has entertaining rules wrt calling set_terminate
 56   // and set_unexpected in the middle of the exception cleanup process.
 57   std::unexpected_handler unexpectedHandler;
 58   std::terminate_handler terminateHandler;
 59
 60   // The caught exception stack threads through here.
 61   __cxa_exception *nextException;
 62
 63   // How many nested handlers have caught this exception.  A negated
 64   // value is a signal that this object has been rethrown.
 65   int handlerCount;
 66
 67 #ifdef __ARM_EABI_UNWINDER__
 68   // Stack of exceptions in cleanups.
 69   __cxa_exception* nextPropagatingException;
 70
 71   // The nuber of active cleanup handlers for this exception.
 72   int propagationCount;
 73 #else
 74   // Cache parsed handler data from the personality routine Phase 1
 75   // for Phase 2 and __cxa_call_unexpected.
 76   int handlerSwitchValue;
 77   const unsigned char *actionRecord;
 78   const unsigned char *languageSpecificData;
 79   _Unwind_Ptr catchTemp;
 80   void *adjustedPtr;
 81 #endif
 82
 83   // The generic exception header.  Must be last.
 84   _Unwind_Exception unwindHeader;
 85 };
  • Caught Exception Stack
    • define in gcc_folder/libstdc++-v3/libsupc++/unwind-cxx.h
136 // Each thread in a C++ program has access to a __cxa_eh_globals object.
137 struct __cxa_eh_globals
138 {
139   __cxa_exception *caughtExceptions;
140   unsigned int uncaughtExceptions;
141 #ifdef __ARM_EABI_UNWINDER__
142   __cxa_exception* propagatingExceptions;
143 #endif
144 };
  • caughtExceptions:
    • a global exception link list for each thread
    • the most recent first
    • linked through nextException of exception header

Throwing an Exception

  • call __cxa_allocate_exception to create exception object and then call __cxa_throw to throw it
  • __cxa_allocate_exception define in gcc_folder/libstdc++-v3/libsupc++/eh_alloc.cc
 96 extern "C" void *
 97 __cxxabiv1::__cxa_allocate_exception(std::size_t thrown_size) throw()
 98 {
 99   void *ret;
100
101   thrown_size += sizeof (__cxa_refcounted_exception);
102   ret = malloc (thrown_size);
103
104   if (! ret)
105     {
106       __gnu_cxx::__scoped_lock sentry(emergency_mutex);
107
108       bitmask_type used = emergency_used;
109       unsigned int which = 0;
110
111       if (thrown_size > EMERGENCY_OBJ_SIZE)
112         goto failed;
113       while (used & 1)
114         {
115           used >>= 1;
116           if (++which >= EMERGENCY_OBJ_COUNT)
117             goto failed;
118         }
119
120       emergency_used |= (bitmask_type)1 << which;
121       ret = &emergency_buffer[which][0];
122
123     failed:;
124
125       if (!ret)
126         std::terminate ();
127     }
128
129   // We have an uncaught exception as soon as we allocate memory.  This
130   // yields uncaught_exception() true during the copy-constructor that
131   // initializes the exception object.  See Issue 475.
132   __cxa_eh_globals *globals = __cxa_get_globals ();
133   globals->uncaughtExceptions += 1;
134
135   memset (ret, 0, sizeof (__cxa_refcounted_exception));
136
137   return (void *)((char *)ret + sizeof (__cxa_refcounted_exception));
138 }
  • Exception object must be thread safe, therefore normally be allocated in the heap

Exception Handlers

  • exception handler一定会呼叫到 __cxa_begin_catch__cxa_end_catch
  • __cxa_begin_catch define in gcc_folder/libstdc++-v3/libsupc++/eh_catch.cc
 39 extern "C" void *
 40 __cxxabiv1::__cxa_begin_catch (void *exc_obj_in) throw()
 41 {
 42   _Unwind_Exception *exceptionObject
 43     = reinterpret_cast <_Unwind_Exception *>(exc_obj_in);
 44   __cxa_eh_globals *globals = __cxa_get_globals ();
 45   __cxa_exception *prev = globals->caughtExceptions;
 46   __cxa_exception *header = __get_exception_header_from_ue (exceptionObject);
 47   void* objectp;
 .....
  • __cxa_begin_catch会做
    • exception handler count +1
    • 把exception加进global的caught exception
    • uncaught exception count -1
 30 extern "C" void *
 31 __cxxabiv1::__cxa_get_exception_ptr(void *exc_obj_in) throw()
 32 {
 33   _Unwind_Exception *exceptionObject
 34     = reinterpret_cast <_Unwind_Exception *>(exc_obj_in);
 35
 36   return __gxx_caught_object(exceptionObject);
 37 }
  • __cxa_get_exception_ptr回传adjust pointer, 当catch的参数不复杂时 不会呼叫
  • __cxa_end_catch define in gcc_folder/libstdc++-v3/libsupc++/eh_catch.cc
 90 extern "C" void
 91 __cxxabiv1::__cxa_end_catch ()
 92 {
 93   __cxa_eh_globals *globals = __cxa_get_globals_fast ();
 94   __cxa_exception *header = globals->caughtExceptions;
 95
 96   // A rethrow of a foreign exception will be removed from the
 97   // the exception stack immediately by __cxa_rethrow.
 98   if (!header)
 99     return;
 .....
  • handler count of the exception-1
  • 当handler count为0时 把exception 从caught exception stack中移除
  • Destroy the exception if the handler count == 0

Exception handling Porting in GCC

  • http://luse.blogspot.tw/2010/01/c.html
  • C++ ABI for Itanium: Exception handling
  • sjlj vs dw2 in GCC
    • http://stackoverflow.com/questions/318383/exception-handling-models-of-gcc
    • sjlj的作法 是把current context都存在current stack中然后呼叫 _Unwind_SjLj_Register 把著些context chain起来, unwind install 时在把register 设为handler在的context
    • sjlj为了纪录所有frame的context, 每call 一个function 就会呼叫一次_Unwind_SjLj_Register, performance上可能会比较差, 因为每call个frame就要储存所有的资讯
    • dw2 unwinding则是利用存在.eh_frame sectino里的CIE和FDE判断 unwinding时需要设回哪些register不需要每个frame都在执行一次store context的动作,所以 performance应该会比较好.可是为了储存.eh_frame的资讯,codesize也会大一些
/* Use which register to pass data to exception handling library routines */
/* Also need save these register in prolog */
#define EH_RETURN_DATA_REGNO(N)
/* The data would be a exception object pointer which will be initial in PERSONALITY_FUNCTION */
/* Use which register to adjust $sp before leave _Unwind_Resume */
/* Need emit the sp adjust in exception handler epilog */
#define EH_RETURN_STACKADJ_RTX
Implement EH_RETURN_HANDLER_RTX or eh_return pattern
to modify return address then jump to exception handler when return from exception handling library routines
  • When we write throw in C++ program, the assembly will generate following calls
    • __cxa_allocate_exception
      • create exception object
      • source code in: gcc_folder/libstdc++-v3/libsupc++/eh_alloc.cc
    • __cxa_throw
      • create exception header and then call _Unwind_RaiseException or _Unwind_SjLj_RaiseException
      • source code in: gcc_folder/libstdc++-v3/libsupc++/eh_throw.cc
    • _Unwind_Resume
      • Resume propagation of an existing exception
      • source code in : gcc_folder/gcc/unwind.inc which will include to unwind-sjlj.c and unwind-dw2.c
  • Definition of FDE and CIE in unwind-dw2-fde.h
 /* Terminology:
   CIE - Common Information Element
   FDE - Frame Descriptor Element

   There is one per function, and it describes where the function code
   is located, and what the register lifetimes and stack layout are
   within the function.

   The data structures are defined in the DWARF specification, although
   not in a very readable way (see LITERATURE).

   Every time an exception is thrown, the code needs to locate the FDE
   for the current function, and starts to look for exception regions
   from that FDE. This works in a two-level search:
   a) in a linear search, find the shared image (i.e. DLL) containing
      the PC
   b) using the FDE table for that shared object, locate the FDE using
      binary search (which requires the sorting).  */
  • _Unwind_RaiseException source in unwind.inc
    • Phase1 find the handler without change stack (with flag _UA_SEARCH_PHASE)
      • call uw_frame_state_for (struct _Unwind_Context *context, _Unwind_FrameState *fs)
        • According to context->ra to find the caller frame state
          • will call _Unwind_Find_FDE (context->ra, ..) find FDE from dwarf2 object and then decode FDE and the update to fs structure
      • Unwinding until find handler
  • _Unwind_RaiseException_Phase2 source in unwind.inc
    • Phase2 unwind and change stack to pass exception object to handler
      • Phase2 will call (*fs.personality) with _UA_CLEANUP_PHASE flag
      • which will update context to pass exception object
      • PERSONALITY_FUNCTION in libstdc++-v3/libsupc++/eh_personality.cc
 ...
  _Unwind_SetGR (context, __builtin_eh_return_data_regno (0),
                 (_Unwind_Ptr) ue_header);
  _Unwind_SetGR (context, __builtin_eh_return_data_regno (1), 0);
  _Unwind_SetIP (context, landing_pad);
  return _URC_INSTALL_CONTEXT;
}
  • call uw_install_context in _Unwind_RaiseException_Phase2 to change control to handler
/* Install TARGET into CURRENT so that we can return to it.  This is a
   macro because __builtin_eh_return must be invoked in the context of
   our caller.  */

#define uw_install_context(CURRENT, TARGET)                             \
  do                                                                    \
    {                                                                   \
      long offset = uw_install_context_1 ((CURRENT), (TARGET));         \
      void *handler = __builtin_frob_return_addr ((TARGET)->ra);        \
      _Unwind_DebugHook ((TARGET)->cfa, handler);                       \
      __builtin_eh_return (offset, handler);                            \
    }                                                                   \
  while (0)
  • uw_install_context_1: recover handler registers and calculate sp offset
    • sp offset will set to EH_RETURN_STACKADJ_RTX
  • __builtin_frob_return_addr: get handler address
  • _builtin_eh_return: modify return address change control to exception hanlder
  • __builtin_eh_return in ARM
arm_set_return_address (rtx source, rtx scratch)
{
  arm_stack_offsets *offsets;
  HOST_WIDE_INT delta;
  rtx addr;
  unsigned long saved_regs;

  offsets = arm_get_frame_offsets ();
  saved_regs = offsets->saved_regs_mask;

  if ((saved_regs & (1 << LR_REGNUM)) == 0) <== if return address in LR_REGNUM, emit a move to change return address to handler
    emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source);
  else <== if return address in memory, change return address to handler
    {  
      if (frame_pointer_needed)
        addr = plus_constant(hard_frame_pointer_rtx, -4);
      else
        {
          /* LR will be the first saved register.  */
          delta = offsets->outgoing_args - (offsets->frame + 4);

          if (delta >= 4096)
            {
              emit_insn (gen_addsi3 (scratch, stack_pointer_rtx,
                                     GEN_INT (delta & ~4095)));
              addr = scratch;
              delta &= 4095;
            }
          else
            addr = stack_pointer_rtx;

          addr = plus_constant (addr, delta);
        }
      emit_move_insn (gen_frame_mem (Pmode, addr), source);
    }
}

Use GCC plugin to add a new optimization pass

  • A plugin registers a new pass with GCC by calling register_callback with the PLUGIN_PASS_MANAGER_SETUP event and a pointer to a struct register_pass_info object defined as follows
  • define in tree-pass.h
enum pass_positioning_ops
{
PASS_POS_INSERT_AFTER, // Insert after the reference pass.
PASS_POS_INSERT_BEFORE, // Insert before the reference pass.
PASS_POS_REPLACE // Replace the reference pass.
};
struct register_pass_info
{
  struct opt_pass *pass;            /* New pass to register.  */
  const char *reference_pass_name;  /* Name of the reference pass for hooking
                                       up the new pass.  */
  int ref_pass_instance_number;     /* Insert the pass at the specified
                                       instance number of the reference pass.
                                       Do it for every instance if it is 0.  */
  enum pass_positioning_ops pos_op; /* how to insert the new pass.  */
};
/* Optimization pass type.  */
enum opt_pass_type
{
  GIMPLE_PASS,
  RTL_PASS,
  SIMPLE_IPA_PASS,
  IPA_PASS
};
/* Describe one pass; this is the common part shared across different pass
   types.  */
struct opt_pass
{
  /* Optimization pass type.  */
  enum opt_pass_type type;

  /* Terse name of the pass used as a fragment of the dump file
     name.  If the name starts with a star, no dump happens. */
  const char *name;

  /* If non-null, this pass and all sub-passes are executed only if
     the function returns true.  */
  bool (*gate) (void);

  /* This is the code to run.  If null, then there should be sub-passes
     otherwise this pass does nothing.  The return value contains
     TODOs to execute in addition to those in TODO_flags_finish.   */
  unsigned int (*execute) (void);

  /* A list of sub-passes to run, dependent on gate predicate.  */
  struct opt_pass *sub;

  /* Next in the list of passes to run, independent of gate predicate.  */
  struct opt_pass *next;

  /* Static pass number, used as a fragment of the dump file name.  */
  int static_pass_number;
  /* The timevar id associated with this pass.  */
  /* ??? Ideally would be dynamically assigned.  */
  timevar_id_t tv_id;

  /* Sets of properties input and output from this pass.  */
  unsigned int properties_required;
  unsigned int properties_provided;
  unsigned int properties_destroyed;

  /* Flags indicating common sets things to do before and after.  */
  unsigned int todo_flags_start;
  unsigned int todo_flags_finish;
};
/* Sample plugin code that registers a new pass. */
int
plugin_init (struct plugin_name_args *plugin_info,
struct plugin_gcc_version *version)
{
struct register_pass_info pass_info;
...
/* Code to fill in the pass_info object with new pass information. */
...
/* Register the new pass. */
register_callback (plugin_info->base_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &pass_info);
...
}

Inline rules in GCC

  • according the inline relative parameter default setting in params.def
    • all instruction number means internal gcc instruction numbers, not assembly instruction numbers
# The function marked inline and the method defined in class if insn numbers < 400
# when -finline-functions (-O3) is enable, the function insns < 40 will auto inlined
# The maximum number of instructions inline function can grow to via recursive inlining is 450
# The maximum number of instructions non-inline function can grow to via recursive inlining is 450
# The maximum depth of recursive inlining for inline functions is 8
# The maximum depth of recursive inlining for non-inline functions is 8
# Inline recursively only when the probability of call being executed exceeds the parameter is 10 
# The maximum number of nested indirect inlining performed by early inliner is 10
# The size of function body to be considered large is 2700
# Maximal growth due to inlining of large function (in percent) 100
# The size of translation unit to be considered large 10000
# How much can given compilation unit grow because of the inlining (in percent) is 30
# How much can given compilation unit grow because of the interprocedural constant propagation (in percent) 10
# Maximal estimated growth of function body caused by early inlining of single call is 10
# The size of stack frame to be considered large is 256
# Maximal stack frame growth due to inlining (in percent) is 1000

RTX cost in GCC

  • /home/cychen/source/462/gcc/optabs.c
    • in avoid_expensive_constant,
  set rtx_cost to guide gcc generate constant in operation (binoptab->code) 
  or just force the constant to register.
static rtx
avoid_expensive_constant (enum machine_mode mode, optab binoptab,
                          rtx x, bool unsignedp)
{
  bool speed = optimize_insn_for_speed_p ();

  if (mode != VOIDmode
      && optimize
      && CONSTANT_P (x)
      && rtx_cost (x, binoptab->code, speed) > rtx_cost (x, SET, speed))
    {
      if (CONST_INT_P (x))
        {
          HOST_WIDE_INT intval = trunc_int_for_mode (INTVAL (x), mode);
          if (intval != INTVAL (x))
            x = GEN_INT (intval);
        }
      else
        x = convert_modes (mode, VOIDmode, x, unsignedp);
      x = force_reg (mode, x);
    }
  return x;
}
  • gcc_source/gcc/loop-invariant.c
    • In create_new_invariant
  GCC will use address cost compare with magic number 3 to determine the insn is cheap or not.
  The cheap address will no accumulate the rtx cost of the insn.
  Then the insn may not move out of the loop.
static struct invariant *
create_new_invariant (struct def *def, rtx insn, bitmap depends_on,
                      bool always_executed)
{
  struct invariant *inv = XNEW (struct invariant);
  rtx set = single_set (insn);
  bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));

  inv->def = def;
  inv->always_executed = always_executed;
  inv->depends_on = depends_on;

  /* If the set is simple, usually by moving it we move the whole store out of
     the loop.  Otherwise we save only cost of the computation.  */
  if (def) <== When the Insn will set value to register, def will be true.
    {
      inv->cost = rtx_cost (set, SET, speed);
      /* ??? Try to determine cheapness of address computation.  Unfortunately
         the address cost is only a relative measure, we can't really compare
         it with any absolute number, but only with other address costs.
         But here we don't have any other addresses, so compare with a magic
         number anyway.  It has to be large enough to not regress PR33928
         (by avoiding to move reg+8,reg+16,reg+24 invariants), but small
         enough to not regress 410.bwaves either (by still moving reg+reg
         invariants).
         See http://gcc.gnu.org/ml/gcc-patches/2009-10/msg01210.html .  */
      inv->cheap_address = address_cost (SET_SRC (set), word_mode,
                                         ADDR_SPACE_GENERIC, speed) < 3;
    }
  else
    {
      inv->cost = rtx_cost (SET_SRC (set), SET, speed);
      inv->cheap_address = false;
    }
   ..
  • gcc_source/gcc/fwprop.c
    • In try_fwprop_subst
/* Try substituting NEW into LOC, which originated from forward propagation
   of USE's value from DEF_INSN.  SET_REG_EQUAL says whether we are
   substituting the whole SET_SRC, so we can set a REG_EQUAL note if the
   new insn is not recognized.  Return whether the substitution was
   performed.  */

static bool
try_fwprop_subst (df_ref use, rtx *loc, rtx new_rtx, rtx def_insn, bool set_reg_equal)
{
  rtx insn = DF_REF_INSN (use);
  rtx set = single_set (insn);
  rtx note = NULL_RTX;
  bool speed = optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn));
  int old_cost = 0;
  bool ok;

  update_df_init (def_insn, insn);

  /* forward_propagate_subreg may be operating on an instruction with
     multiple sets.  If so, assume the cost of the new instruction is
     not greater than the old one.  */
  if (set)
    old_cost = rtx_cost (SET_SRC (set), SET, speed);
...
  validate_unshare_change (insn, loc, new_rtx, true);
  if (!verify_changes (0))
    {
      if (dump_file)
        fprintf (dump_file, "Changes to insn %d not recognized\n",
                 INSN_UID (insn));
      ok = false;
    }

  else if (DF_REF_TYPE (use) == DF_REF_REG_USE
           && set
           && rtx_cost (SET_SRC (set), SET, speed) > old_cost)
    {
      if (dump_file)
        fprintf (dump_file, "Changes to insn %d not profitable\n",
                 INSN_UID (insn));
      ok = false;
    }
...
  • gcc_source/gcc/fwprop.c
    • In should_replace_address
static bool
should_replace_address (rtx old_rtx, rtx new_rtx, enum machine_mode mode,
                        addr_space_t as, bool speed)
{
  int gain;

  if (rtx_equal_p (old_rtx, new_rtx)
      || !memory_address_addr_space_p (mode, new_rtx, as))
    return false;

  /* Copy propagation is always ok.  */
  if (REG_P (old_rtx) && REG_P (new_rtx))
    return true;

  /* Prefer the new address if it is less expensive.  */
  gain = (address_cost (old_rtx, mode, as, speed)
          - address_cost (new_rtx, mode, as, speed));

  /* If the addresses have equivalent cost, prefer the new address
     if it has the highest `rtx_cost'.  That has the potential of
     eliminating the most insns without additional costs, and it
     is the same that cse.c used to do.  */
  if (gain == 0)
    gain = rtx_cost (new_rtx, SET, speed) - rtx_cost (old_rtx, SET, speed);

  return (gain > 0);
}

Loop Invariant Code Motion in GCC

  • Move loop invariant out of loop may have performance improvement
  • However, it may increase the register pressure and produce spill code.
  • Therefore, GCC need a way to estimate register pressure increment for Loop Invariant Code Motion (LICM).
  • GCC have two way to estimate register pressure for LICM.
    • IRA way register pressure estimation
      • Use IRA based register pressure calculation in RTL loop optimizations.
      • Use -fira-loop-pressure to enable
      • Scan the register pressure increment for each cover class.
    • Non-IRA way register pressure estimation
  • gain_for_invariant in loop-invariant.c
    • Get computation cost(comp_cost) from rtx_cost which will simulate performance gain
    • Get size cost(size_cost) from IRA way or non-IRA way which will simulate size increment from spill code
    • The function will return gain = comp_cost - size_cost
      • Non-positive gains will not do the code motion.
    • Use two array to record register pressure increment
      • new_regs: The register increment of all loop invariant except current one (accumulate)
      • regs_needed: The register increment of current loop invariant (current one)
  • Non-IRA way will call "estimate_reg_pressure_cost" which define in cfgloopanal.c
/* Estimates cost of increased register pressure caused by making N_NEW new
   registers live around the loop.  N_OLD is the number of registers live
   around the loop.  If CALL_P is true, also take into account that
   call-used registers may be clobbered in the loop body, reducing the
   number of available registers before we spill.  */

unsigned
estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed,
                            bool call_p)
{
  unsigned cost;
  unsigned regs_needed = n_new + n_old;
  unsigned available_regs = target_avail_regs;

  /* If there is a call in the loop body, the call-clobbered registers
     are not available for loop invariants.  */
  if (call_p)
    available_regs = available_regs - target_clobbered_regs;

  /* If we have enough registers, we should use them and not restrict
     the transformations unnecessarily.  */
  if (regs_needed + target_res_regs <= available_regs)
    return 0;

  if (regs_needed <= available_regs)
    /* If we are close to running out of registers, try to preserve
       them.  */
    cost = target_reg_cost [speed] * n_new;
  else
    /* If we run out of registers, it is very expensive to add another
       one.  */
    cost = target_spill_cost [speed] * n_new;

  if (optimize && (flag_ira_region == IRA_REGION_ALL
                   || flag_ira_region == IRA_REGION_MIXED)
      && number_of_loops () <= (unsigned) IRA_MAX_LOOPS_NUM)
    /* IRA regional allocation deals with high register pressure
       better.  So decrease the cost (to do more accurate the cost
       calculation for IRA, we need to know how many registers lives
       through the loop transparently).  */
    cost /= 2;

  return cost;
}
  • Setting target_reg_cost and target_spill_cost in init_set_costs
for (speed = 0; speed < 2; speed++)
     {
      crtl->maybe_hot_insn_p = speed;
      /* Set up the costs for using extra registers:

         1) If not many free registers remain, we should prefer having an
            additional move to decreasing the number of available registers.
            (TARGET_REG_COST).
         2) If no registers are available, we need to spill, which may require
            storing the old value to memory and loading it back
            (TARGET_SPILL_COST).  */

      start_sequence ();
      emit_move_insn (reg1, reg2);
      seq = get_insns ();
      end_sequence ();
      target_reg_cost [speed] = seq_cost (seq, speed);

      start_sequence ();
      emit_move_insn (mem, reg1);
      emit_move_insn (reg2, mem);
      seq = get_insns ();
      end_sequence ();
      target_spill_cost [speed] = seq_cost (seq, speed);
    }

  • IRA way to estimate register pressure
      for (i = 0; i < ira_reg_class_cover_size; i++)
        {
          cover_class = ira_reg_class_cover[i];
          if ((int) new_regs[cover_class]
              + (int) regs_needed[cover_class]
              + LOOP_DATA (curr_loop)->max_reg_pressure[cover_class]
              + IRA_LOOP_RESERVED_REGS
              > ira_available_class_regs[cover_class])
            break;
        }
      if (i < ira_reg_class_cover_size)
        /* There will be register pressure excess and we want not to
           make this loop invariant motion.  All loop invariants with
           non-positive gains will be rejected in function
           find_invariants_to_move.  Therefore we return the negative
           number here.

           One could think that this rejects also expensive loop
           invariant motions and this will hurt code performance.
           However numerous experiments with different heuristics
           taking invariant cost into account did not confirm this
           assumption.  There are possible explanations for this
           result:
           o probably all expensive invariants were already moved out
             of the loop by PRE and gimple invariant motion pass.
           o expensive invariant execution will be hidden by insn
             scheduling or OOO processor hardware because usually such
             invariants have a lot of freedom to be executed
             out-of-order.
           Another reason for ignoring invariant cost vs spilling cost
           heuristics is also in difficulties to evaluate accurately
           spill cost at this stage.  */
        return -1;
      else
        size_cost = 0;
  • IRA_LOOP_RESERVED_REGS define in params.def
DEFPARAM (PARAM_IRA_LOOP_RESERVED_REGS,
          "ira-loop-reserved-regs",
          "The number of registers in each class kept unused by loop invariant motion",
          2, 0, 0)

Data speculation and control speculation in Itanium

GCC instruction scheduling

  • GCC have two phase to do instruction scheduling
    • one before register allocation and may do interblock scheduling
    • one after register allocation and the region will be single basic block
  • Scheduling main entry function schedule_insns in sched-rgn.c
/* The one entry point in this file.  */
3299 void
3300 schedule_insns (void)
3301 {
3302   int rgn;
3303
3304   /* Taking care of this degenerate case makes the rest of
3305      this code simpler.  */
3306   if (n_basic_blocks == NUM_FIXED_BLOCKS)
3307     return;
3308
3309   rgn_setup_common_sched_info ();
3310   rgn_setup_sched_infos ();
3311
3312   haifa_sched_init ();
3313   sched_rgn_init (reload_completed);
3314
3315   bitmap_initialize (&not_in_df, 0);
3316   bitmap_clear (&not_in_df);
3317
3318   /* Schedule every region in the subroutine.  */
3319   for (rgn = 0; rgn < nr_regions; rgn++)
3320     if (dbg_cnt (sched_region))
3321       schedule_region (rgn);
3322
3323   /* Clean up.  */
3324   sched_rgn_finish ();
3325   bitmap_clear (&not_in_df);
3326
3327   haifa_sched_finish ();
3328 }
  • The sched1 pass before register allocation may do interblock scheduling
    • before do interblock scheduling will call find_rgns() in sched-rgn.c
1068 /* Wrapper function.
1069    If FLAG_SEL_SCHED_PIPELINING is set, then use custom function to form
1070    regions.  Otherwise just call find_rgns_haifa.  */
1071 static void
1072 find_rgns (void)
1073 {
1074   if (sel_sched_p () && flag_sel_sched_pipelining)
1075     sel_find_rgns ();
1076   else
1077     haifa_find_rgns ();
1078 }
  • haifa_find_rgns () will initial the region
    • Identify inner loop as region
      • perform DFS search to find back edge (find loop)
      • Use dominate info to identify nature loop (loop header dominate all loop body blocks)
    • when the flag max-sched-extend-regions-iters is not zero
      • perform extend regions
        • The idea is to topologically walk through CFG in top-down order.
          • if all the predecessors of a node are marked to be in the same region then current node is also marked to be a part of that region.
          • Otherwise the node starts its own region
    • Other basic blocks not in any region will form to a region as single basic block.
  • After form regions, call schedule_region () do region scheduling
    • call compute_block_dependences(int bb) compute dependences inside bb
      • call sched_analyze () in compute_block_dependences ()
        • call deps_analyze_insn() in sched_analyze ()
          • For jump instruction set memory barrier for scheduling
            • memory access instruction won't cross jump instructions
          • For call instructions
            • If is a SETJMP instruction, Assume that all registers, may be clobbered by this call
            • clobber caller save register
          • call sched_analyze_insn ()
            • Analyze an INSN with pattern X to find all dependencies.
  • Compute priority for each insn in the region
2897 void
2898 compute_priorities (void)
2899 {
2900   int bb;
2901
2902   current_sched_info->sched_max_insns_priority = 0;
2903   for (bb = 0; bb < current_nr_blocks; bb++)
2904     {
2905       rtx head, tail;
2906
2907       gcc_assert (EBB_FIRST_BB (bb) == EBB_LAST_BB (bb));
2908       get_ebb_head_tail (EBB_FIRST_BB (bb), EBB_LAST_BB (bb), &head, &tail);
2909
2910       if (no_real_insns_p (head, tail))
2911         continue;
2912
2913       rgn_n_insns += set_priorities (head, tail);
2914     }
2915   current_sched_info->sched_max_insns_priority++;
2916 }
  • set_priorities will call priority (rtx insn)
    • the priority of each insn is max (cost + priority_of_each_consumer)
      • cost is the cycle latency from the insn issue and results
        • depend on scheduling porting
    • Therefore, the following insn will have high priority
      • long latency insn
      • have high priority consumer
      • lots of successor insn
 224 /* An instruction is ready to be scheduled when all insns preceding it
 225    have already been scheduled.  It is important to ensure that all
 226    insns which use its result will not be executed until its result
 227    has been computed.  An insn is maintained in one of four structures:
 228
 229    (P) the "Pending" set of insns which cannot be scheduled until
 230    their dependencies have been satisfied.
 231    (Q) the "Queued" set of insns that can be scheduled when sufficient
 232    time has passed.
 233    (R) the "Ready" list of unscheduled, uncommitted insns.
 234    (S) the "Scheduled" list of insns.
 235
 236    Initially, all insns are either "Pending" or "Ready" depending on
 237    whether their dependencies are satisfied.
 238
 239    Insns move from the "Ready" list to the "Scheduled" list as they
 240    are committed to the schedule.  As this occurs, the insns in the
 241    "Pending" list have their dependencies satisfied and move to either
 242    the "Ready" list or the "Queued" set depending on whether
 243    sufficient time has passed to make them ready.  As time passes,
 244    insns move from the "Queued" set to the "Ready" list.
 245
 246    The "Pending" list (P) are the insns in the INSN_FORW_DEPS of the
 247    unscheduled insns, i.e., those that are ready, queued, and pending.
 248    The "Queued" set (Q) is implemented by the variable `insn_queue'.
 249    The "Ready" list (R) is implemented by the variables `ready' and
 250    `n_ready'.
 251    The "Scheduled" list (S) is the new insn chain built by this pass.
 252
 253    The transition (R->S) is implemented in the scheduling loop in
 254    `schedule_block' when the best insn to schedule is chosen.
 255    The transitions (P->R and P->Q) are implemented in `schedule_insn' as
 256    insns move from the ready list to the scheduled list.
 257    The transition (Q->R) is implemented in 'queue_to_insn' as time
 258    passes or stalls are introduced.  */

Phase ordering Issues of Register Allocation and Scheduling

  • Register allocation and scheduling have conflict goal
    • Register allocation: Minimize spill code, create sequential code for register reuse
    • Scheduler: Try to fill all parallel unit and make the register pressure higher
  • Instruction Scheduling followed by Register Allocation followed by post-scheduling
    • GCC use the order sched1->RA->sched2
    • schedule before RA have more opportunity to find ILP
      • could perform renaming to resolve output/anti dependency therefore have wilder range to schedule
      • may increase register usage
    • post-scheduling: scheduling again after generate spill code
    • won't increase register usage
  • Current GCC also have cooperative approaches
    • a register pressure sensitive instruction scheduling before RA
    • Use flag -fsched-pressure to enable it

Data flow initialization in GCC

  • in phase .dfinit
    • rest_of_handle_df_initialize () in df-core.c
      1. Add problems
        1. df_scan_add_problem()
        2. df_lr_add_problem()
        3. if (optimize > 1) df_live_add_problem()
      2. initialize df_d data structure
      3. df_hard_reg_init () : setup eliminate registers
      4. df_compute_register_ever_live (true)
        • compute regs_ever_live
        • parameter determine reset or not
      5. df_scan_blocks() : scan all of the blocks in block_to_analyze
        • scan all blocks in function if block_to_analyze is NULL
        • df_get_regular_block_artificial_uses
          • regular_block: non-entry, non-exit basic block
          • artificial use:
            1. frame pointer
            2. hardware frame pointer
            3. argument pointer
            4. PIC_OFFSET_TABLE_REGNUM
            5. stack pointer
            6. EH_RETURN_DATA_REGNO
  • data flow problems (df-problems.c)
    1. reaching definition
    2. upward exposed uses
    3. live variables
      1. LR problem
      2. UR problem
      3. the intersection of LR and UR
    4. un-initialized variables
    5. def-use chain
    6. use-def chain
  • Example of using dataflow routines
    1. df_*_add_problem (flags)
      • add a problem
      • a problem could depend on other problem
        • DU-chain or UD-chain is dependent on solving reaching definition
    2. df_set_blocks (blocks)
      • setup the region of dataflow analysis
      • default scan entire function without the df_set_blocks
    3. df_analyze (): solve the problems
    4. df_dump(stderr) : dump debug log
    5. df_finish_pass() : call df_remove_problem()
  • Data structure of dataflow relative
    1. reg-def: all the define of the register
    2. reg-use: all the use of the register
    3. insn-def: all the define of the insn
    4. insn-use: all the use of the insn
    5. def-use: all the use of the define
    6. use-def: all the define of the use
  • df insn info kept in an array DF_INSN_INFO
    • index by insn uid
  • Each insn has 3 list
    1. defs list: access by DF_INSN_INFO_DEFS
    2. uses list: access by DF_INSN_INFO_USES
    3. equal list: REG_EQUAL
  • Each insn has logical uid (LUID) store in DF_INSN_INFO
    • LUID only correct after call df_analyze ()
  • artificial defs may exist in basic block end to mark always live
    • Ex: stack pointer

LMA/VMA

  • LMA:load address
  • VMA:virtual address
  • GNU linker provide the feature for the code
    • run on ROM and need to access global data (data section)
    • However, ROM only can read
    • Therefore, linker provide the feature to access data section in VMA (RAM)
    • Which means data section in image would place on LMA but global data access would base on VMA
    • User need to copy data section from LMA(ROM image) to VMA(RAM) in initial code.
  • Reference: http://www.delorie.com/gnu/docs/binutils/ld_33.html

LLVM Note

  • Modify configure files for new target
    • configure
    • autoconf/config.sub
    • projects/sample/autoconf/config.sub
    • include/llvm/ADT/Triple.h
    • lib/Support/Triple.cpp
  • Register Relative class
    • include/llvm/Target/Target.td
      • SubRegIndex
        • identify sub-register
      • Register
        • specify register name and sub-reg relation
      • RegisterClass
        • to specify register class for instruction operand constraint including
          • regType
          • alignment
          • register-list (register allocation order)
          • alternative (register allocation) order

Aarch64 Frame Stack

        +-------------------------------+
        |                               |
        |  incoming stack arguments     |
        |                               |
        +-------------------------------+ <-- arg_pointer_rtx  \
        |                               |                       \
        |  callee-allocated save area   |
        |  for register varargs         |
        |                               |
        +-------------------------------+                     original_frame_size
        |                               |
        |  local variables              |
        |                               |                       /
        +-------------------------------+ <-- frame_pointer_rtx/
        |                               |
        |  callee-saved registers       |
        |                               |
        +-------------------------------+
        |  LR'                          |
        +-------------------------------+
        |  FP'                          |
      P +-------------------------------+ <-- hard_frame_pointer_rtx
        |  dynamic allocation           |
        +-------------------------------+
        |                               |
        |  outgoing stack arguments     |
        |                               |
        +-------------------------------+ <-- stack_pointer_rtx

   Dynamic stack allocations such as alloca insert data at point P.
   They decrease stack_pointer_rtx but leave frame_pointer_rtx and
   hard_frame_pointer_rtx unchanged.  */

  /* sub sp, sp, #<frame_size>
     stp {fp, lr}, [sp, #<frame_size> - 16]
     add fp, sp, #<frame_size> - hardfp_offset
     stp {cs_reg}, [fp, #-16] etc.

     sub sp, sp, <final_adjustment_if_any>
  */

Prologue Generation Note

  • aarch64_expand_prologue (void)
/* The reason for following code

   is to split the sp adjust timing
   
   which could make the store pair code-gen for callee save 
   
   could meet the store pair offset constraint. */


  offset = frame_size = total frame size;

  fp_offset = (offset
               - original_frame_size
               - cfun->machine->frame.saved_regs_size); 

   /* limit to 512 because it's load/store pair offset limit. */
   if (offset < 512)
     {
	if (frame_pointer_needed)
	  {
            if (fp_offset > 0)
	      {
		1) adjust sp (total frame_size)
		2) stp x29, x30, [sp, 0]
              }
            else
              {
                1) stp     x29, x30, [sp, -48(adjust sp)]!
              }
            3) initial FP value;
	  }
        else
          {
            1) adjust sp (total frame_size)
          }

        5) store pair for callee save registers 
           call aarch64_save_or_restore_callee_save_registers     
      }
    else
      {
        fp_offset = 0;
        offset = original_frame_size + cfun->machine->frame.saved_regs_size;
      
        if (offset >= 512)
          offset = cfun->machine->frame.saved_regs_size;
      
        frame_size -= (offset + crtl->outgoing_args_size);
        
        
        if (split could make offset < 512)
          {
            split frame to 2 part:
		------------
		|  offset  |
		------------
		| outgoing |
		------------
            1) generate offset relative code like if (offset <512) above
            2) generate sp adjust for outgoing
            
          }
        else
          {
            split frame to 3 part:
		------------
		|frame_size|
		------------
		|  offset  |
		------------
		| outgoing |
		------------
            1) generate frame_size adjust code
		  
               if (frame_size >= 0x1000000)
		 {
		   a. set frame_size to r16(IP0)
		   b. adjust sp by sub r16
                 }
                else
                 {
                   a. sub sp ,{frame_size & ~(0xfff)}
		   b. sub sp ,{frame_size & 0xfff}

		   the benifit to split high/low part of sp adjust:
			each part have chance execute by one sub.
                 }
	    2) generate offset part like if (offset <512) above
            3) generate sp adjust for outgoing
            
          }
      }

Epilogue Generation Note

  • aarch64_expand_epilogue (bool for_sibcall)
  if (offset >= 512)
    {
       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
         {
	   sp = sp + {outgoing_args_size}
         }
    }

  if (frame_pointer_needed
      && (crtl->outgoing_args_size || cfun->calls_alloca))
    {
       sp = fp - fp_offset
       cfa_reg = stack_pointer_rtx;
       (it seems for helping debug message generation)
    }

  /* generate callee save registers pop */
  aarch64_save_or_restore_callee_save_registers
    (fp_offset + cfun->machine->frame.hardfp_offset, 1);

   if (offset > 0)
     {
	if (frame_pointer_needed)
	  {
            if (fp_offset > 0)
	      {
		1) ldp x29, x30, [fp, fp_offset]
		2) sp = sp + offset
              }
            else
              {
                1) ldp x29, x30, [sp, offset]!
		2) add_reg_note of the 1) insn as REG_CFA_ADJUST_CFA
              }
	  }
        else
          {
            1) sp = sp + offset
          }
      }

  /* Stack adjustment for exception handler.  */
  if (crtl->calls_eh_return)
    {
      /* We need to unwind the stack by the offset computed by
         EH_RETURN_STACKADJ_RTX.  However, at this point the CFA is
         based on SP.  Ideally we would update the SP and define the
         CFA along the lines of:

         SP = SP + EH_RETURN_STACKADJ_RTX
         (regnote CFA = SP - EH_RETURN_STACKADJ_RTX)

         However the dwarf emitter only understands a constant
         register offset.

         The solution chosen here is to use the otherwise unused IP0
         as a temporary register to hold the current SP value.  The
         CFA is described using IP0 then SP is modified.  */
       
         insn = emit_move_insn (IP0, SP);
         add_reg_note of insn as REG_CFA_DEF_CFA
         RTX_FRAME_RELATED_P (insn) = 1;

	 sp = sp + EH_RETURN_STACKADJ_RTX
     }


 if (the frame split to 3 part)
   {
		------------
		|frame_size|
		------------
		|  offset  |
		------------
		| outgoing |
		------------

     in this time outgoing and offset have been adjust

     generate frame_size adjust code
		  
               if (frame_size >= 0x1000000)
		 {
		   a. set frame_size to r16(IP0)
		   b. adjust sp by add r16
                 }
                else
                 {
                   a. add sp ,{frame_size & ~(0xfff)}
		   b. add sp ,{frame_size & 0xfff}

		   the benifit to split high/low part of sp adjust:
			each part have chance execute by one add.
                 }
    }

  emit_use (LR)

  if (!for_sibcall)
    emit ret;

Probability Theory and Statistics Note

About me