From 7c946199cd5eab2917bb053ca6fdc6997d27aa7a Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Fri, 1 Jul 2011 16:47:13 +1000
Subject: m68k: merge and clean up delay.h files

The real difference between the mmu and non-mmu variants of the delay.h
files has nothing to do with having an mmu or not. It is processor family
differences that require slightly different code. Merge the delay_mm.h and
delay_no.h files back into a single file.

The primary difference we need to deal with is whether the processor
supports a 32bit * 32bit -> 64bit multiply. Without it we need to do some
shift scaling as well as use a 32bit * 32bit -> 32bit multiply. If building
for a multi-CPU type kernel then we must use the simpler mult/shift scaling.

This version of the delay code allows the CPU32 family to use a 64bit mul,
since it supports this instruction; the old code did not.

The changes use macros where appropriate to try and optimize constant sized
udelay times. And it removes the use of a fixed lib function for the non-mmu
case. Code size on typical kernel configurations is similar, or only larger
by a few tens of bytes.

Also removed the unused muldiv() code from delay_mm.h.

Build and run tested on ColdFire and ARAnyM. Build tested only on 68328
and 68360 (CPU32).

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/delay.h | 97 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 94 insertions(+), 3 deletions(-)

(limited to 'arch/m68k/include/asm/delay.h')

diff --git a/arch/m68k/include/asm/delay.h b/arch/m68k/include/asm/delay.h
index d2598e3..9c09bec 100644
--- a/arch/m68k/include/asm/delay.h
+++ b/arch/m68k/include/asm/delay.h
@@ -1,5 +1,96 @@
-#ifdef __uClinux__
-#include "delay_no.h"
+#ifndef _M68K_DELAY_H
+#define _M68K_DELAY_H
+
+#include <asm/param.h>
+
+/*
+ * Copyright (C) 1994 Hamish Macdonald
+ * Copyright (C) 2004 Greg Ungerer <gerg@uclinux.com>
+ *
+ * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ */
+
+#if defined(CONFIG_COLDFIRE)
+/*
+ * The ColdFire runs the delay loop at significantly different speeds
+ * depending upon long word alignment or not.  We'll pad it to
+ * long word alignment which is the faster version.
+ * The 0x4a8e is of course a 'tstl %fp' instruction.  This is better
+ * than using a NOP (0x4e71) instruction because it executes in one
+ * cycle not three and doesn't allow for an arbitrary delay waiting
+ * for bus cycles to finish.  Also fp/a6 isn't likely to cause a
+ * stall waiting for the register to become valid if such is added
+ * to the coldfire at some stage.
+ */
+#define	DELAY_ALIGN	".balignw 4, 0x4a8e\n\t"
 #else
-#include "delay_mm.h"
+/*
+ * No instruction alignment required for other m68k types.
+ */
+#define	DELAY_ALIGN
 #endif
+
+static inline void __delay(unsigned long loops)
+{
+	__asm__ __volatile__ (
+		DELAY_ALIGN
+		"1: subql #1,%0\n\t"
+		"jcc 1b"
+		: "=d" (loops)
+		: "0" (loops));
+}
+
+extern void __bad_udelay(void);
+
+
+#if defined(CONFIG_M68000) || defined(CONFIG_COLDFIRE)
+/*
+ * The simpler m68k and ColdFire processors do not have a 32*32->64
+ * multiply instruction. So we need to handle them a little differently.
+ * We use a bit of shifting and a single 32*32->32 multiply to get close.
+ * This is a macro so that the const version can factor out the first
+ * multiply and shift.
+ */
+#define	HZSCALE		(268435456 / (1000000 / HZ))
+
+#define	__const_udelay(u) \
+	__delay(((((u) * HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6)
+
+#else
+
+static inline void __xdelay(unsigned long xloops)
+{
+	unsigned long tmp;
+
+	__asm__ ("mulul %2,%0:%1"
+		: "=d" (xloops), "=d" (tmp)
+		: "d" (xloops), "1" (loops_per_jiffy));
+	__delay(xloops * HZ);
+}
+
+/*
+ * The definition of __const_udelay is specifically made a macro so that
+ * the const factor (4295 = 2**32 / 1000000) can be optimized out when
+ * the delay is a const.
+ */
+#define	__const_udelay(n)	(__xdelay((n) * 4295))
+
+#endif
+
+static inline void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs);
+}
+
+/*
+ * Use only for very small delays ( < 1 msec).  Should probably use a
+ * lookup table, really, as the multiplications take much too long with
+ * short delays.  This is a "reasonable" implementation, though (and the
+ * first constant multiplication gets optimized away if the delay is
+ * a constant)
+ */
+#define udelay(n) (__builtin_constant_p(n) ? \
+	((n) > 20000 ? __bad_udelay() : __const_udelay(n)) : __udelay(n))
+
+
+#endif /* defined(_M68K_DELAY_H) */
-- 
cgit v1.1