Source-Changes-HG archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
[src/trunk]: src/gnu/dist/toolchain/gcc/config/arm Changes to better support ...
details: https://anonhg.NetBSD.org/src/rev/55dc241124ae
branches: trunk
changeset: 535523:55dc241124ae
user: thorpej <thorpej%NetBSD.org@localhost>
date: Tue Aug 20 23:46:44 2002 +0000
description:
Changes to better support XScale, round 1, back-ported from GCC 3.2.
The GCC ChangeLog does not have a complete description to quote here,
so:
arm.c:
* arm_override_options(): Set arm_is_xscale according to the
-mcpu=xscale option. Set arm_constant_limit to 2 if arm_is_xscale.
* arm_adjust_cost(): If arm_is_xscale, account for stalls that can
occur due to shifted operands.
* arm_gen_load_multiple(): Account for the cost of ldm vs. ldr if
arm_is_xscale.
* arm_gen_store_multiple(): Likewise for stm vs. str.
arm.h:
* CONSTANT_ALIGNMENT(): Use a constant alignment factor of 2 if
arm_is_xscale.
* MOVE_RATIO: Set to 4 if arm_is_xscale.
arm.md:
* Add XScale scheduling parameters.
* Define a "shift" attribute (used by arm_adjust_cost()) and give it
to the appropriate operands on andsi_not_shiftsi_si, *shiftsi3,
*shiftsi3_compare0, *shiftsi3_compare0_scratch, *notsi_shiftsi,
*notsi_shiftsi_compare0, *not_shiftsi_compare0_scratch,
abssi2, *neg_abssi2, extendsidi2, *cmpsi_shiftsi, *cmpsi_shiftsi_swp,
*cmpsi_neg_shiftsi, *arith_shiftsi, *arith_shiftsi_compare0,
*arith_shiftsi_compare0_scratch, *sub_shiftsi, *sub_shiftsi_compare0,
*sub_shiftsi_compare0_scratch, *if_shift_move, *if_move_shift,
and *if_shift_shift.
diffstat:
gnu/dist/toolchain/gcc/config/arm/arm.c | 129 ++++++++++++++++++++++++++++++-
gnu/dist/toolchain/gcc/config/arm/arm.h | 15 ++-
gnu/dist/toolchain/gcc/config/arm/arm.md | 108 +++++++++++++++++++++----
3 files changed, 231 insertions(+), 21 deletions(-)
diffs (truncated from 519 to 300 lines):
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.c
--- a/gnu/dist/toolchain/gcc/config/arm/arm.c Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.c Tue Aug 20 23:46:44 2002 +0000
@@ -103,6 +103,7 @@
#define FL_THUMB 0x20 /* Thumb aware */
#define FL_LDSCHED 0x40 /* Load scheduling necessary */
#define FL_STRONG 0x80 /* StrongARM */
+#define FL_XSCALE 0x100 /* XScale */
/* The bits in this mask specify which instructions we are allowed to generate. */
static int insn_flags = 0;
@@ -127,6 +128,9 @@
/* Nonzero if this chip is a StrongARM. */
int arm_is_strong = 0;
+/* Nonzero if this chip is an XScale. */
+int arm_is_xscale = 0;
+
/* Nonzero if this chip is a an ARM6 or an ARM7. */
int arm_is_6_or_7 = 0;
@@ -235,7 +239,7 @@
--thorpej%netbsd.org@localhost */
{"arm10tdmi", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
{"arm1020t", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_THUMB | FL_LDSCHED },
- {"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG },
+ {"xscale", FL_MODE32 | FL_FAST_MULT | FL_ARCH4 | FL_LDSCHED | FL_STRONG | FL_XSCALE },
{NULL, 0}
};
@@ -523,6 +527,7 @@
/* Initialise boolean versions of the flags, for use in the arm.md file. */
arm_fast_multiply = (insn_flags & FL_FAST_MULT) != 0;
arm_arch4 = (insn_flags & FL_ARCH4) != 0;
+ arm_is_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_is_strong = (tune_flags & FL_STRONG) != 0;
@@ -574,6 +579,9 @@
to load a constant, and the load scheduler may well reduce that to 1. */
if (optimize_size || (tune_flags & FL_LDSCHED))
arm_constant_limit = 1;
+
+ if (arm_is_xscale)
+ arm_constant_limit = 2;
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM).
@@ -1867,6 +1875,47 @@
{
rtx i_pat, d_pat;
+ /* Some true dependencies can have a higher cost depending
+ on precisely how certain input operands are used. */
+ if (arm_is_xscale
+ && REG_NOTE_KIND (link) == 0
+ && recog_memoized (insn) < 0
+ && recog_memoized (dep) < 0)
+ {
+ int shift_opnum = get_attr_shift (insn);
+ enum attr_type attr_type = get_attr_type (dep);
+
+ /* If nonzero, SHIFT_OPNUM contains the operand number of a shifted
+ operand for INSN. If we have a shifted input operand and the
+ instruction we depend on is another ALU instruction, then we may
+ have to account for an additional stall. */
+ if (shift_opnum != 0 && attr_type == TYPE_NORMAL)
+ {
+ rtx shifted_operand;
+ int opno;
+
+ /* Get the shifted operand. */
+ extract_insn (insn);
+ shifted_operand = recog_operand[shift_opnum];
+
+ /* Iterate over all the operands in DEP. If we write an operand
+ that overlaps with SHIFTED_OPERAND, then we have to increase the
+ cost of this dependency. */
+ extract_insn (dep);
+ preprocess_constraints ();
+ for (opno = 0; opno < recog_n_operands; opno++)
+ {
+ /* We can ignore strict inputs. */
+ if (recog_op_type[opno] == OP_IN)
+ continue;
+
+ if (reg_overlap_mentioned_p (recog_operand[opno],
+ shifted_operand))
+ return 2;
+ }
+ }
+ }
+
/* XXX This is not strictly true for the FPA. */
if (REG_NOTE_KIND(link) == REG_DEP_ANTI
|| REG_NOTE_KIND(link) == REG_DEP_OUTPUT)
@@ -3164,6 +3213,58 @@
int sign = up ? 1 : -1;
rtx mem;
+ /* XScale has load-store double instructions, but they have stricter
+ alignment requirements than load-store multiple, so we can not
+ use them.
+
+ For XScale ldm requires 2 + NREGS cycles to complete and blocks
+ the pipeline until completion.
+
+ NREGS CYCLES
+ 1 3
+ 2 4
+ 3 5
+ 4 6
+
+ an ldr instruction takes 1-3 cycles, but does not block the
+ pipeline.
+
+ NREGS CYCLES
+ 1 1-3
+ 2 2-6
+ 3 3-9
+ 4 4-12
+
+ Best case ldr will always win. However, the more ldr instructions
+ we issue, the less likely we are to be able to schedule them well.
+ Using ldr instructions also increases code size.
+
+ As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
+ for counts of 3 or 4 regs. */
+ if (arm_is_xscale && count <= 2 && ! optimize_size)
+ {
+ rtx seq;
+
+ start_sequence ();
+
+ for (i = 0; i < count; i++)
+ {
+ mem = gen_rtx_MEM (SImode, plus_constant (from, i * 4 * sign));
+ RTX_UNCHANGING_P (mem) = unchanging_p;
+ MEM_IN_STRUCT_P (mem) = in_struct_p;
+ MEM_SCALAR_P (mem) = scalar_p;
+ emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
+ }
+
+ if (write_back)
+ emit_move_insn (from, plus_constant (from, count * 4 * sign));
+
+ seq = gen_sequence ();
+ end_sequence ();
+
+ return seq;
+ }
+
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)
@@ -3208,6 +3309,32 @@
int sign = up ? 1 : -1;
rtx mem;
+ /* See arm_gen_load_multiple for discussion of
+ the pros/cons of ldm/stm usage for XScale. */
+ if (arm_is_xscale && count <= 2 && ! optimize_size)
+ {
+ rtx seq;
+
+ start_sequence ();
+
+ for (i = 0; i < count; i++)
+ {
+ mem = gen_rtx_MEM (SImode, plus_constant (to, i * 4 * sign));
+ RTX_UNCHANGING_P (mem) = unchanging_p;
+ MEM_IN_STRUCT_P (mem) = in_struct_p;
+ MEM_SCALAR_P (mem) = scalar_p;
+ emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
+ }
+
+ if (write_back)
+ emit_move_insn (to, plus_constant (to, count * 4 * sign));
+
+ seq = gen_sequence ();
+ end_sequence ();
+
+ return seq;
+ }
+
result = gen_rtx_PARALLEL (VOIDmode,
rtvec_alloc (count + (write_back ? 2 : 0)));
if (write_back)
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.h
--- a/gnu/dist/toolchain/gcc/config/arm/arm.h Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.h Tue Aug 20 23:46:44 2002 +0000
@@ -477,6 +477,9 @@
/* Nonzero if this chip is a StrongARM. */
extern int arm_is_strong;
+/* Nonzero if this chip is an XScale. */
+extern int arm_is_xscale;
+
/* Nonzero if this chip is a an ARM6 or an ARM7. */
extern int arm_is_6_or_7;
@@ -614,9 +617,12 @@
#define BIGGEST_ALIGNMENT 32
/* Make strings word-aligned so strcpy from constants will be faster. */
-#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
- (TREE_CODE (EXP) == STRING_CST \
- && (ALIGN) < BITS_PER_WORD ? BITS_PER_WORD : (ALIGN))
+#define CONSTANT_ALIGNMENT_FACTOR (! arm_is_xscale ? 1 : 2)
+
+#define CONSTANT_ALIGNMENT(EXP, ALIGN) \
+ ((TREE_CODE (EXP) == STRING_CST \
+ && (ALIGN) < BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR) \
+ ? BITS_PER_WORD * CONSTANT_ALIGNMENT_FACTOR : (ALIGN))
/* Every structures size must be a multiple of 32 bits. */
/* This is for compatibility with ARMCC. ARM SDT Reference Manual
@@ -1703,6 +1709,9 @@
in one reasonably fast instruction. */
#define MOVE_MAX 4
+#undef MOVE_RATIO
+#define MOVE_RATIO (arm_is_xscale ? 4 : 2)
+
/* Define if operations between registers always perform the operation
on the full register even if a narrower mode is specified. */
#define WORD_REGISTER_OPERATIONS
diff -r 56b54675101a -r 55dc241124ae gnu/dist/toolchain/gcc/config/arm/arm.md
--- a/gnu/dist/toolchain/gcc/config/arm/arm.md Tue Aug 20 23:02:44 2002 +0000
+++ b/gnu/dist/toolchain/gcc/config/arm/arm.md Tue Aug 20 23:46:44 2002 +0000
@@ -48,6 +48,11 @@
(define_attr "is_strongarm" "no,yes" (const (symbol_ref "arm_is_strong")))
+;; Operand number of an input operand that is shifted. Zero if the
+;; given instruction does not shift one of its input operands.
+(define_attr "is_xscale" "no,yes" (const (symbol_ref "arm_is_xscale")))
+(define_attr "shift" "" (const_int 0))
+
; Floating Point Unit. If we only have floating point emulation, then there
; is no point in scheduling the floating point insns. (Well, for best
; performance we should try and group them together).
@@ -238,12 +243,26 @@
;; Core unit
;;--------------------------------------------------------------------
;; Everything must spend at least one cycle in the core unit
+(define_function_unit "core" 1 0 (eq_attr "core_cycles" "single") 1 1)
+
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "store1")) 1 1)
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "yes") (eq_attr "type" "load")) 2 1)
+;; We do not need to conditionalize the define_function_unit immediately
+;; above. This one will be ignored for anything other than xscale
+;; compiles and for xscale compiles it provides a larger delay
+;; and the scheduler will DTRT.
+;; FIXME: this test needs to be revamped to not depend on this feature
+;; of the scheduler.
+
+(define_function_unit "core" 1 0
+ (and (and (eq_attr "ldsched" "yes") (eq_attr "type" "load"))
+ (eq_attr "is_xscale" "yes"))
+ 3 1)
+
(define_function_unit "core" 1 0
(and (eq_attr "ldsched" "!yes") (eq_attr "type" "load,store1")) 2 2)
@@ -275,6 +294,10 @@
(define_function_unit "core" 1 0 (eq_attr "type" "store3") 4 4)
(define_function_unit "core" 1 0 (eq_attr "type" "store4") 5 5)
+
+(define_function_unit "core" 1 0
+ (and (eq_attr "core_cycles" "multi")
+ (eq_attr "type" "!mult,load,store1,store2,store3,store4")) 32 32)
;; Note: For DImode insns, there is normally no reason why operands should
;; not be in the same register, what we don't want is for something being
@@ -1410,7 +1433,9 @@
(match_operand:SI 3 "arm_rhs_operand" "rM")]))
(match_operand:SI 1 "s_register_operand" "r")))]
""
- "bic%?\\t%0, %1, %2%S4")
+ "bic%?\\t%0, %1, %2%S4"
+ [(set_attr "shift" "2")]
+)
(define_insn "*andsi_notsi_si_compare0"
[(set (reg:CC_NOOV 24)
@@ -1783,7 +1808,9 @@
[(match_operand:SI 1 "s_register_operand" "r")
(match_operand:SI 2 "reg_or_int_operand" "rM")]))]
""
- "mov%?\\t%0, %1%S3")
+ "mov%?\\t%0, %1%S3"
+ [(set_attr "shift" "1")]
+)
Home |
Main Index |
Thread Index |
Old Index