Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/gnu/dist/toolchain/gcc/config/arm Changes to better support ...
details: https://anonhg.NetBSD.org/src/rev/55dc241124ae
branches: trunk
changeset: 535523:55dc241124ae
user: thorpej <thorpej%NetBSD.org@localhost>
date: Tue Aug 20 23:46:44 2002 +0000
description:
Changes to better support XScale, round 1, back-ported from GCC 3.2.
The GCC ChangeLog does not have a complete description to quote here,
so:
arm.c:
* arm_override_options(): Set arm_is_xscale according to the
-mcpu=xscale option. Set arm_constant_limit to 2 if arm_is_xscale.
* arm_adjust_cost(): If arm_is_xscale, account for stalls that can
occur due to shifted operands.
* arm_gen_load_multiple(): Account for the cost of ldm vs. ldr if
arm_is_xscale.
* arm_gen_store_multiple(): Likewise for stm vs. str.
arm.h:
* CONSTANT_ALIGNMENT(): Use a constant alignment factor of 2 if
arm_is_xscale.
* MOVE_RATIO: Set to 4 if arm_is_xscale.
arm.md:
* Add XScale scheduling parameters.
* Define a "shift" attribute (used by arm_adjust_cost()) and give it
to the appropriate operands on andsi_not_shiftsi_si, *shiftsi3,
*shiftsi3_compare0, *shiftsi3_compare0_scratch, *notsi_shiftsi,
*notsi_shiftsi_compare0, *not_shiftsi_compare0_scratch,
abssi2, *neg_abssi2, extendsidi2, *cmpsi_shiftsi, *cmpsi_shiftsi_swp,
*cmpsi_neg_shiftsi, *arith_shiftsi, *arith_shiftsi_compare0,
*arith_shiftsi_compare0_scratch, *sub_shiftsi, *sub_shiftsi_compare0,
*sub_shiftsi_compare0_scratch, *if_shift_move, *if_move_shift,
and *if_shift_shift.
diffstat:
gnu/dist/toolchain/gcc/config/arm/arm.c | 129 ++++++++++++++++++++++++++++++-
gnu/dist/toolchain/gcc/config/arm/arm.h | 15 ++-
gnu/dist/toolchain/gcc/config/arm/arm.md | 108 +++++++++++++++++++++----
3 files changed, 231 insertions(+), 21 deletions(-)
diffs (truncated from 519 to 300 lines):
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.c
--- a/gnu/dist/toolchain/gcc/config/arm/arm.c Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.c Tue Aug 20 23:46:44 2002 +0000
@@ -103,6 +103,7 @@
#define FL_THUMB 0x20 /* Thumb aware */
#define FL_LDSCHED 0x40 /* Load scheduling necessary */
#define FL_STRONG 0x80 /* StrongARM */
+#define FL_XSCALE 0x100 /* XScale */
/* The bits in this mask specify which instructions we are allowed to generate. */
static int insn_flags = 0;
@@ -127,6 +128,9 @@
/* Nonzero if this chip is a StrongARM. */
int arm_is_strong = 0;
+/* Nonzero if this chip is an XScale. */
+int arm_is_xscale = 0;
+
/* Nonzero if this chip is a an ARM6 or an ARM7. */
int arm_is_6_or_7 = 0;
@@ -235,7 +239,7 @@
--thorpej%netbsd.org@localhost */
{"arm10tdmi", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
{"arm1020t", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
- {"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG },
+ {"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG | FL_XSCALE },
{NULL, 0}
};
@@ -523,6 +527,7 @@
/* Initialise boolean versions of the flags, for use in the arm.md file. */
arm_fast_multiply = (insn_flags & FL_FAST_MULT) != 0;
arm_arch4 = (insn_flags & FL_ARCH4) != 0;
+ arm_is_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_is_strong = (tune_flags & FL_STRONG) != 0;
@@ -574,6 +579,9 @@
to load a constant, and the load scheduler may well reduce that to 1. */
if (optimize_size || (tune_flags & FL_LDSCHED))
arm_constant_limit = 1;
+
+ if (arm_is_xscale)
+ arm_constant_limit = 2;
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM).
@@ -1867,6 +1875,47 @@
{
rtx i_pat, d_pat;
+ /* Some true dependencies can have a higher cost depending
+ on precisely how certain input operands are used. */
+ if (arm_is_xscale
+ && REG_NOTE_KIND (link) == 0
+ && recog_memoized (insn) < 0
+ && recog_memoized (dep) < 0)
+ {
+ int shift_opnum = get_attr_shift (insn);
+ enum attr_type attr_type = get_attr_type (dep);
+
+ /* If nonzero, SHIFT_OPNUM contains the operand number of a shifted
+ operand for INSN. If we have a shifted input operand and the
+ instruction we depend on is another ALU instruction, then we may
+ have to account for an additional stall. */
+ if (shift_opnum != 0 && attr_type == TYPE_NORMAL)
+ {
+ rtx shifted_operand;
+ int opno;
+
+ /* Get the shifted operand. */
+ extract_insn (insn);
+ shifted_operand = recog_operand[shift_opnum];
+
+ /* Iterate over all the operands in DEP. If we write an operand
+ that overlaps with SHIFTED_OPERAND, then we have to increase the
+ cost of this dependency. */
+ extract_insn (dep);
+ preprocess_constraints ();
+ for (opno = 0; opno < recog_n_operands; opno++)
+ {
+ /* We can ignore strict inputs. */
+ if (recog_op_type[opno] == OP_IN)
+ continue;
+
+ if (reg_overlap_mentioned_p (recog_operand[opno],
+ shifted_operand))
+ return 2;
+ }
+ }
+ }
+
/* XXX This is not strictly true for the FPA. */
if (REG_NOTE_KIND(link) == REG_DEP_ANTI
|| REG_NOTE_KIND(link) == REG_DEP_OUTPUT)
@@ -3164,6 +3213,58 @@
int sign = up ? 1 : -1;
rtx mem;
+ /* XScale has load-store double instructions, but they have stricter
+ alignment requirements than load-store multiple, so we can not
+ use them.
+
+ For XScale ldm requires 2 + NREGS cycles to complete and blocks
+ the pipeline until completion.
+
+ NREGS CYCLES
+ 1 3
+ 2 4
+ 3 5
+ 4 6
+
+ an ldr instruction takes 1-3 cycles, but does not block the
+ pipeline.
+
+ NREGS CYCLES
+ 1 1-3
+ 2 2-6
+ 3 3-9
+ 4 4-12
+
+ Best case ldr will always win. However, the more ldr instructions
+ we issue, the less likely we are to be able to schedule them well.
+ Using ldr instructions also increases code size.
+
+ As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
+ for counts of 3 or 4 regs. */
+ if (arm_is_xscale && count <= 2 && ! optimize_size)
+ {
+ rtx seq;
+
+ start_sequence ();
+
+ for (i = 0; i < count; i++)
+ {
+ mem = gen_rtx_MEM (SImode, plus_constant (from, i * 4 * sign));
+ RTX_UNCHANGING_P (mem) = unchanging_p;
+ MEM_IN_STRUCT_P (mem) = in_struct_p;
+ MEM_SCALAR_P (mem) = scalar_p;
+ emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
+ }
+
+ if (write_back)
+ emit_move_insn (from, plus_constant (from, count * 4 * sign));
+
+ seq = gen_sequence ();
+ end_sequence ();
+
+ return seq;
+ }
+
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)
@@ -3208,6 +3309,32 @@
int sign = up ? 1 : -1;
rtx mem;
+ /* See arm_gen_load_multiple for discussion of
+ the pros/cons of ldm/stm usage for XScale. */
+ if (arm_is_xscale && count <= 2 && ! optimize_size)
+ {
+ rtx seq;
+
+ start_sequence ();
+
+ for (i = 0; i < count; i++)
+ {
+ mem = gen_rtx_MEM (SImode, plus_constant (to, i * 4 * sign));
+ RTX_UNCHANGING_P (mem) = unchanging_p;
+ MEM_IN_STRUCT_P (mem) = in_struct_p;
+ MEM_SCALAR_P (mem) = scalar_p;
+ emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
+ }
+
+ if (write_back)
+ emit_move_insn (to, plus_constant (to, count * 4 * sign));
+
+ seq = gen_sequence ();
+ end_sequence ();
+
+ return seq;
+ }
+
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.h
--- a/gnu/dist/toolchain/gcc/config/arm/arm.h Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.h Tue Aug 20 23:46:44 2002 +0000
@@ -477,6 +477,9 @@
/* Nonzero if this chip is a StrongARM. */
extern int arm_is_strong;
+/* Nonzero if this chip is an XScale. */
+extern int arm_is_xscale;
+
/* Nonzero if this chip is a an ARM6 or an ARM7. */
extern int arm_is_6_or_7;
@@ -614,9 +617,12 @@
#define BIGGEST_ALIGNMENT 32
/* Make strings word-aligned so strcpy from constants will be faster. */
-#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
- (TREE_CODE (EXP) == STRING_CST \
- && (ALIGN) < BITS_PER_WORD ? BITS_PER_WORD : (ALIGN))
+#define CONSTANT_ALIGNMENT_FACTOR (! arm_is_xscale ? 1 : 2)
+
+#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
+ ((TREE_CODE (EXP) == STRING_CST \
+ && (ALIGN) < BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR) \
+ ? BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR : (ALIGN))
/* Every structures size must be a multiple of 32 bits. */
/* This is for compatibility with ARMCC. ARM SDT Reference Manual
@@ -1703,6 +1709,9 @@
in one reasonably fast instruction. */
#define MOVE_MAX 4
+#undef MOVE_RATIO
+#define MOVE_RATIO (arm_is_xscale ? 4 : 2)
+
/* Define if operations between registers always perform the operation
on the full register even if a narrower mode is specified. */
#define WORD_REGISTER_OPERATIONS
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.md
--- a/gnu/dist/toolchain/gcc/config/arm/arm.md Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.md Tue Aug 20 23:46:44 2002 +0000
@@ -48,6 +48,11 @@
(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_is_strong")))
+;; Operand number of an input operand that is shifted. Zero if the
+;; given instruction does not shift one of its input operands.
+(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_is_xscale")))
+(define_attr "shift" "" (const_int 0))
+
; Floating Point Unit. If we only have floating point emulation, then there
; is no point in scheduling the floating point insns. (Well, for best
; performance we should try and group them together).
@@ -238,12 +243,26 @@
;; Core unit
;;--------------------------------------------------------------------
;; Everything must spend at least one cycle in the core unit
+(define_function_unit "core" 1 0 (eq_attr "core_cycles" "single") 1 1)
+
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "store1")) 1 1)
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "load")) 2 1)
+;; We do not need to conditionalize the define_function_unit immediately
+;; above. This one will be ignored for anything other than xscale
+;; compiles and for xscale compiles it provides a larger delay
+;; and the scheduler will DTRT.
+;; FIXME: this test needs to be revamped to not depend on this feature
+;; of the scheduler.
+
+(define_function_unit "core" 1 0
+ (and (and (eq_attr "ldsched" "yes") (eq_attr "type" "load"))
+ (eq_attr "is_xscale" "yes"))
+ 3 1)
+
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "!yes") (eq_attr "type" "load,store1")) 2 2)
@@ -275,6 +294,10 @@
(define_function_unit "core" 1 0 (eq_attr "type" "store3") 4 4)
(define_function_unit "core" 1 0 (eq_attr "type" "store4") 5 5)
+
+(define_function_unit "core" 1 0
+ (and (eq_attr "core_cycles" "multi")
+ (eq_attr "type" "!mult,load,store1,store2,store3,store4")) 32 32)
;; Note: For DImode insns, there is normally no reason why operands should
;; not be in the same register, what we don't want is for something being
@@ -1410,7 +1433,9 @@
(match_operand:SI 3 "arm_rhs_operand" "rM")]))
(match_operand:SI 1 "s_register_operand" "r")))]
""
- "bic%?\\t%0, %1, %2%S4")
+ "bic%?\\t%0, %1, %2%S4"
+ [(set_attr "shift" "2")]
+)
(define_insn "*andsi_notsi_si_compare0"
[(set (reg:CC_NOOV 24)
@@ -1783,7 +1808,9 @@
[(match_operand:SI 1 "s_register_operand" "r")
(match_operand:SI 2 "reg_or_int_operand" "rM")]))]
""
- "mov%?\\t%0, %1%S3")
+ "mov%?\\t%0, %1%S3"
+ [(set_attr "shift" "1")]
+)
Home |
Main Index |
Thread Index |
Old Index