aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/CodingStyle17
-rw-r--r--Documentation/DocBook/Makefile19
-rw-r--r--Documentation/DocBook/kernel-api.tmpl63
-rw-r--r--Documentation/DocBook/librs.tmpl16
-rw-r--r--Documentation/DocBook/man/Makefile3
-rw-r--r--Documentation/SubmittingDrivers15
-rw-r--r--Documentation/accounting/getdelays.c20
-rw-r--r--Documentation/blackfin/00-INDEX11
-rw-r--r--Documentation/blackfin/Filesystems169
-rw-r--r--Documentation/blackfin/cache-lock.txt48
-rw-r--r--Documentation/blackfin/cachefeatures.txt65
-rw-r--r--Documentation/cciss.txt13
-rw-r--r--Documentation/fb/deferred_io.txt75
-rw-r--r--Documentation/fb/s3fb.txt12
-rw-r--r--Documentation/feature-removal-schedule.txt30
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/jfs.txt8
-rw-r--r--Documentation/filesystems/proc.txt38
-rw-r--r--Documentation/filesystems/vfat.txt7
-rw-r--r--Documentation/filesystems/vfs.txt23
-rw-r--r--Documentation/hwmon/coretemp36
-rw-r--r--Documentation/hwmon/max665053
-rw-r--r--Documentation/hwmon/smsc47m111
-rw-r--r--Documentation/hwmon/smsc47m1927
-rw-r--r--Documentation/hwmon/sysfs-interface7
-rw-r--r--Documentation/ia64/aliasing-test.c247
-rw-r--r--Documentation/ia64/aliasing.txt71
-rw-r--r--Documentation/ia64/err_inject.txt1068
-rw-r--r--Documentation/ioctl-number.txt3
-rw-r--r--Documentation/kbuild/modules.txt2
-rw-r--r--Documentation/kernel-parameters.txt40
-rw-r--r--Documentation/kprobes.txt34
-rw-r--r--Documentation/laptop-mode.txt2
-rw-r--r--Documentation/oops-tracing.txt3
-rw-r--r--Documentation/pcmcia/driver.txt30
-rw-r--r--Documentation/power/basic-pm-debugging.txt106
-rw-r--r--Documentation/power/drivers-testing.txt42
-rw-r--r--Documentation/power/interface.txt8
-rw-r--r--Documentation/powerpc/booting-without-of.txt4
-rw-r--r--Documentation/rtc.txt7
-rw-r--r--Documentation/sh/clk.txt32
-rw-r--r--Documentation/spi/pxa2xx2
-rw-r--r--Documentation/spi/spi-summary43
-rw-r--r--Documentation/spi/spidev307
-rw-r--r--Documentation/sysctl/vm.txt23
-rw-r--r--Documentation/sysrq.txt4
-rw-r--r--Documentation/tty.txt4
-rw-r--r--Documentation/vm/slabinfo.c943
-rw-r--r--Documentation/vm/slub.txt113
49 files changed, 3759 insertions, 147 deletions
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 9069189..afc2867 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -160,6 +160,21 @@ supply of new-lines on your screen is not a renewable resource (think
25-line terminal screens here), you have more empty lines to put
comments on.
+Do not unnecessarily use braces where a single statement will do.
+
+if (condition)
+ action();
+
+This does not apply if one branch of a conditional statement is a single
+statement. Use braces in both branches.
+
+if (condition) {
+ do_this();
+ do_that();
+} else {
+ otherwise();
+}
+
3.1: Spaces
Linux kernel style for use of spaces depends (mostly) on
@@ -625,7 +640,7 @@ language.
There appears to be a common misperception that gcc has a magic "make me
faster" speedup option called "inline". While the use of inlines can be
-appropriate (for example as a means of replacing macros, see Chapter 11), it
+appropriate (for example as a means of replacing macros, see Chapter 12), it
very often is not. Abundant use of the inline keyword leads to a much bigger
kernel, which in turn slows the system as a whole down, due to a bigger
icache footprint for the CPU and simply because there is less memory
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 867608a..6fd1646 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -41,8 +41,9 @@ psdocs: $(PS)
PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
pdfdocs: $(PDF)
-HTML := $(patsubst %.xml, %.html, $(BOOKS))
+HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS)))
htmldocs: $(HTML)
+ $(call build_main_index)
MAN := $(patsubst %.xml, %.9, $(BOOKS))
mandocs: $(MAN)
@@ -132,10 +133,17 @@ quiet_cmd_db2pdf = PDF $@
%.pdf : %.xml
$(call cmd,db2pdf)
+
+main_idx = Documentation/DocBook/index.html
+build_main_index = rm -rf $(main_idx) && \
+ echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
+ echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
+ cat $(HTML) >> $(main_idx)
+
quiet_cmd_db2html = HTML $@
cmd_db2html = xmlto xhtml $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
- Goto $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
+ $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
%.html: %.xml
@(which xmlto > /dev/null 2>&1) || \
@@ -152,6 +160,7 @@ quiet_cmd_db2man = MAN $@
@(which xmlto > /dev/null 2>&1) || \
(echo "*** You need to install xmlto ***"; \
exit 1)
+ $(Q)mkdir -p $(obj)/man
$(call cmd,db2man)
@touch $@
@@ -212,11 +221,7 @@ clean-files := $(DOCBOOKS) \
$(patsubst %.xml, %.9, $(DOCBOOKS)) \
$(C-procfs-example)
-clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS))
-
-#man put files in man subdir - traverse down
-subdir- := man/
-
+clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
# Declare the contents of the .PHONY variable as phony. We keep that
# information in a variable se we can use it in if_changed and friends.
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index b61dfc7..a2b2b4d 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -576,4 +576,67 @@ X!Idrivers/video/console/fonts.c
!Edrivers/input/ff-core.c
!Edrivers/input/ff-memless.c
</chapter>
+
+ <chapter id="spi">
+ <title>Serial Peripheral Interface (SPI)</title>
+ <para>
+ SPI is the "Serial Peripheral Interface", widely used with
+ embedded systems because it is a simple and efficient
+ interface: basically a multiplexed shift register.
+ Its three signal wires hold a clock (SCK, often in the range
+ of 1-20 MHz), a "Master Out, Slave In" (MOSI) data line, and
+ a "Master In, Slave Out" (MISO) data line.
+ SPI is a full duplex protocol; for each bit shifted out the
+ MOSI line (one per clock) another is shifted in on the MISO line.
+ Those bits are assembled into words of various sizes on the
+ way to and from system memory.
+ An additional chipselect line is usually active-low (nCS);
+ four signals are normally used for each peripheral, plus
+ sometimes an interrupt.
+ </para>
+ <para>
+ The SPI bus facilities listed here provide a generalized
+ interface to declare SPI busses and devices, manage them
+ according to the standard Linux driver model, and perform
+ input/output operations.
+ At this time, only "master" side interfaces are supported,
+ where Linux talks to SPI peripherals and does not implement
+ such a peripheral itself.
+ (Interfaces to support implementing SPI slaves would
+ necessarily look different.)
+ </para>
+ <para>
+ The programming interface is structured around two kinds of driver,
+ and two kinds of device.
+ A "Controller Driver" abstracts the controller hardware, which may
+ be as simple as a set of GPIO pins or as complex as a pair of FIFOs
+ connected to dual DMA engines on the other side of the SPI shift
+ register (maximizing throughput). Such drivers bridge between
+ whatever bus they sit on (often the platform bus) and SPI, and
+ expose the SPI side of their device as a
+ <structname>struct spi_master</structname>.
+ SPI devices are children of that master, represented as a
+ <structname>struct spi_device</structname> and manufactured from
+ <structname>struct spi_board_info</structname> descriptors which
+ are usually provided by board-specific initialization code.
+ A <structname>struct spi_driver</structname> is called a
+ "Protocol Driver", and is bound to a spi_device using normal
+ driver model calls.
+ </para>
+ <para>
+ The I/O model is a set of queued messages. Protocol drivers
+ submit one or more <structname>struct spi_message</structname>
+ objects, which are processed and completed asynchronously.
+ (There are synchronous wrappers, however.) Messages are
+ built from one or more <structname>struct spi_transfer</structname>
+ objects, each of which wraps a full duplex SPI transfer.
+ A variety of protocol tweaking options are needed, because
+ different chips adopt very different policies for how they
+ use the bits transferred with SPI.
+ </para>
+!Iinclude/linux/spi/spi.h
+!Fdrivers/spi/spi.c spi_register_board_info
+!Edrivers/spi/spi.c
+ </chapter>
+
</book>
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
index 3ff39ba..94f2136 100644
--- a/Documentation/DocBook/librs.tmpl
+++ b/Documentation/DocBook/librs.tmpl
@@ -79,12 +79,12 @@
<chapter id="usage">
<title>Usage</title>
<para>
- This chapter provides examples how to use the library.
+ This chapter provides examples of how to use the library.
</para>
<sect1>
<title>Initializing</title>
<para>
- The init function init_rs returns a pointer to a
+ The init function init_rs returns a pointer to an
rs decoder structure, which holds the necessary
information for encoding, decoding and error correction
with the given polynomial. It either uses an existing
@@ -98,10 +98,10 @@
static struct rs_control *rs_decoder;
/* Symbolsize is 10 (bits)
- * Primitve polynomial is x^10+x^3+1
+ * Primitive polynomial is x^10+x^3+1
* first consecutive root is 0
- * primitve element to generate roots = 1
- * generator polinomial degree (number of roots) = 6
+ * primitive element to generate roots = 1
+ * generator polynomial degree (number of roots) = 6
*/
rs_decoder = init_rs (10, 0x409, 0, 1, 6);
</programlisting>
@@ -116,12 +116,12 @@ rs_decoder = init_rs (10, 0x409, 0, 1, 6);
</para>
<para>
The expanded data can be inverted on the fly by
- providing a non zero inversion mask. The expanded data is
+ providing a non-zero inversion mask. The expanded data is
XOR'ed with the mask. This is used e.g. for FLASH
ECC, where the all 0xFF is inverted to an all 0x00.
The Reed-Solomon code for all 0x00 is all 0x00. The
code is inverted before storing to FLASH so it is 0xFF
- too. This prevent's that reading from an erased FLASH
+ too. This prevents that reading from an erased FLASH
results in ECC errors.
</para>
<para>
@@ -273,7 +273,7 @@ free_rs(rs_decoder);
May be used under the terms of the GNU General Public License (GPL)
</programlisting>
<para>
- The wrapper functions and interfaces are written by Thomas Gleixner
+ The wrapper functions and interfaces are written by Thomas Gleixner.
</para>
<para>
Many users have provided bugfixes, improvements and helping hands for testing.
diff --git a/Documentation/DocBook/man/Makefile b/Documentation/DocBook/man/Makefile
deleted file mode 100644
index 4fb7ea0..0000000
--- a/Documentation/DocBook/man/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-# Rules are put in Documentation/DocBook
-
-clean-files := *.9.gz *.sgml manpage.links manpage.refs
diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers
index 58bead0..d7e2642 100644
--- a/Documentation/SubmittingDrivers
+++ b/Documentation/SubmittingDrivers
@@ -87,6 +87,21 @@ Clarity: It helps if anyone can see how to fix the driver. It helps
driver that intentionally obfuscates how the hardware works
it will go in the bitbucket.
+PM support: Since Linux is used on many portable and desktop systems, your
+ driver is likely to be used on such a system and therefore it
+ should support basic power management by implementing, if
+ necessary, the .suspend and .resume methods used during the
+ system-wide suspend and resume transitions. You should verify
+ that your driver correctly handles the suspend and resume, but
+ if you are unable to ensure that, please at least define the
+ .suspend method returning the -ENOSYS ("Function not
+ implemented") error. You should also try to make sure that your
+ driver uses as little power as possible when it's not doing
+ anything. For the driver testing instructions see
+ Documentation/power/drivers-testing.txt and for a relatively
+ complete overview of the power management issues related to
+ drivers see Documentation/power/devices.txt .
+
Control: In general if there is active maintainance of a driver by
the author then patches will be redirected to them unless
they are totally obvious and without need of checking.
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index e9126e7..71acc28 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -61,8 +61,6 @@ __u64 stime, utime;
#define MAX_MSG_SIZE 1024
/* Maximum number of cpus expected to be specified in a cpumask */
#define MAX_CPUS 32
-/* Maximum length of pathname to log file */
-#define MAX_FILENAME 256
struct msgtemplate {
struct nlmsghdr n;
@@ -72,6 +70,16 @@ struct msgtemplate {
char cpumask[100+6*MAX_CPUS];
+static void usage(void)
+{
+ fprintf(stderr, "getdelays [-dilv] [-w logfile] [-r bufsize] "
+ "[-m cpumask] [-t tgid] [-p pid]\n");
+ fprintf(stderr, " -d: print delayacct stats\n");
+ fprintf(stderr, " -i: print IO accounting (works only with -p)\n");
+ fprintf(stderr, " -l: listen forever\n");
+ fprintf(stderr, " -v: debug on\n");
+}
+
/*
* Create a raw netlink socket and bind
*/
@@ -221,13 +229,13 @@ int main(int argc, char *argv[])
int count = 0;
int write_file = 0;
int maskset = 0;
- char logfile[128];
+ char *logfile = NULL;
int loop = 0;
struct msgtemplate msg;
while (1) {
- c = getopt(argc, argv, "diw:r:m:t:p:v:l");
+ c = getopt(argc, argv, "diw:r:m:t:p:vl");
if (c < 0)
break;
@@ -241,7 +249,7 @@ int main(int argc, char *argv[])
print_io_accounting = 1;
break;
case 'w':
- strncpy(logfile, optarg, MAX_FILENAME);
+ logfile = strdup(optarg);
printf("write to file %s\n", logfile);
write_file = 1;
break;
@@ -277,7 +285,7 @@ int main(int argc, char *argv[])
loop = 1;
break;
default:
- printf("Unknown option %d\n", c);
+ usage();
exit(-1);
}
}
diff --git a/Documentation/blackfin/00-INDEX b/Documentation/blackfin/00-INDEX
new file mode 100644
index 0000000..7cb3b35
--- /dev/null
+++ b/Documentation/blackfin/00-INDEX
@@ -0,0 +1,11 @@
+00-INDEX
+ - This file
+
+cache-lock.txt
+ - HOWTO for blackfin cache locking.
+
+cachefeatures.txt
+ - Supported cache features.
+
+Filesystems
+ - Requirements for mounting the root file system.
diff --git a/Documentation/blackfin/Filesystems b/Documentation/blackfin/Filesystems
new file mode 100644
index 0000000..51260a1
--- /dev/null
+++ b/Documentation/blackfin/Filesystems
@@ -0,0 +1,169 @@
+/*
+ * File: Documentation/blackfin/Filesystems
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: Filesystems 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+ How to mount the root file system in uClinux/Blackfin
+ -----------------------------------------------------
+
+1 Mounting EXT3 File system.
+ ------------------------
+
+ Creating an EXT3 File system for uClinux/Blackfin:
+
+
+Please follow the steps to form the EXT3 File system and mount the same as root
+file system.
+
+a Make an ext3 file system as large as you want the final root file
+ system.
+
+ mkfs.ext3 /dev/ram0 <your-rootfs-size-in-1k-blocks>
+
+b Mount this Empty file system on a free directory as:
+
+ mount -t ext3 /dev/ram0 ./test
+ where ./test is the empty directory.
+
+c Copy your root fs directory that you have so carefully made over.
+
+ cp -af /tmp/my_final_rootfs_files/* ./test
+
+ (For ex: cp -af uClinux-dist/romfs/* ./test)
+
+d If you have done everything right till now you should be able to see
+ the required "root" dir's (that's etc, root, bin, lib, sbin...)
+
+e Now unmount the file system
+
+ umount ./test
+
+f Create the root file system image.
+
+ dd if=/dev/ram0 bs=1k count=<your-rootfs-size-in-1k-blocks> \
+ > ext3fs.img
+
+
+Now you have to tell the kernel that will be mounting this file system as
+rootfs.
+So do a make menuconfig under kernel and select the Ext3 journaling file system
+support under File system --> submenu.
+
+
+2. Mounting EXT2 File system.
+ -------------------------
+
+By default the ext2 file system image will be created if you invoke make from
+the top uClinux-dist directory.
+
+
+3. Mounting CRAMFS File System
+ ----------------------------
+
+To create a CRAMFS file system image execute the command
+
+ mkfs.cramfs ./test cramfs.img
+
+ where ./test is the target directory.
+
+
+4. Mounting ROMFS File System
+ --------------------------
+
+To create a ROMFS file system image execute the command
+
+ genromfs -v -V "ROMdisk" -f romfs.img -d ./test
+
+ where ./test is the target directory
+
+
+5. Mounting the JFFS2 Filesystem
+ -----------------------------
+
+To create a compressed JFFS filesystem (JFFS2), please execute the command
+
+ mkfs.jffs2 -d ./test -o jffs2.img
+
+ where ./test is the target directory.
+
+However, please make sure the following is in your kernel config.
+
+/*
+ * RAM/ROM/Flash chip drivers
+ */
+#define CONFIG_MTD_CFI 1
+#define CONFIG_MTD_ROM 1
+/*
+ * Mapping drivers for chip access
+ */
+#define CONFIG_MTD_COMPLEX_MAPPINGS 1
+#define CONFIG_MTD_BF533 1
+#undef CONFIG_MTD_UCLINUX
+
+Through the u-boot boot loader, use the jffs2.img in the corresponding
+partition made in linux-2.6.x/drivers/mtd/maps/bf533_flash.c.
+
+NOTE - Currently the Flash driver is available only for EZKIT. Watch out for a
+ STAMP driver soon.
+
+
+6. Mounting the NFS File system
+ -----------------------------
+
+ For mounting the NFS please do the following in the kernel config.
+
+ In Networking Support --> Networking options --> TCP/IP networking -->
+ IP: kernel level autoconfiguration
+
+ Enable BOOTP Support.
+
+ In Kernel hacking --> Compiled-in kernel boot parameter add the following
+
+ root=/dev/nfs rw ip=bootp
+
+ In File system --> Network File system, Enable
+
+ NFS file system support --> NFSv3 client support
+ Root File system on NFS
+
+ in uClibc menuconfig, do the following
+ In Networking Support
+ enable Remote Procedure Call (RPC) support
+ Full RPC Support
+
+ On the Host side, ensure that /etc/dhcpd.conf looks something like this
+
+ ddns-update-style ad-hoc;
+ allow bootp;
+ subnet 10.100.4.0 netmask 255.255.255.0 {
+ default-lease-time 122209600;
+ max-lease-time 31557600;
+ group {
+ host bf533 {
+ hardware ethernet 00:CF:52:49:C3:01;
+ fixed-address 10.100.4.50;
+ option root-path "/home/nfsmount";
+ }
+ }
+
+ ensure that /etc/exports looks something like this
+ /home/nfsmount *(rw,no_root_squash,no_all_squash)
+
+ run the following commands as root (may differ depending on your
+ distribution) :
+ - service nfs start
+ - service portmap start
+ - service dhcpd start
+ - /usr/sbin/exportfs
diff --git a/Documentation/blackfin/cache-lock.txt b/Documentation/blackfin/cache-lock.txt
new file mode 100644
index 0000000..88ba1e6
--- /dev/null
+++ b/Documentation/blackfin/cache-lock.txt
@@ -0,0 +1,48 @@
+/*
+ * File: Documentation/blackfin/cache-lock.txt
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: cache-lock.txt 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+How to lock your code in cache in uClinux/blackfin
+--------------------------------------------------
+
+There are only a few steps required to lock your code into the cache.
+Currently you can lock the code by Way.
+
+Below are the interface provided for locking the cache.
+
+
+1. cache_grab_lock(int Ways);
+
+This function grab the lock for locking your code into the cache specified
+by Ways.
+
+
+2. cache_lock(int Ways);
+
+This function should be called after your critical code has been executed.
+Once the critical code exits, the code is now loaded into the cache. This
+function locks the code into the cache.
+
+
+So, the example sequence will be:
+
+ cache_grab_lock(WAY0_L); /* Grab the lock */
+
+ critical_code(); /* Execute the code of interest */
+
+ cache_lock(WAY0_L); /* Lock the cache */
+
+Where WAY0_L signifies WAY0 locking.
diff --git a/Documentation/blackfin/cachefeatures.txt b/Documentation/blackfin/cachefeatures.txt
new file mode 100644
index 0000000..0fbec23
--- /dev/null
+++ b/Documentation/blackfin/cachefeatures.txt
@@ -0,0 +1,65 @@
+/*
+ * File: Documentation/blackfin/cachefeatures.txt
+ * Based on:
+ * Author:
+ *
+ * Created:
+ * Description: This file contains the simple DMA Implementation for Blackfin
+ *
+ * Rev: $Id: cachefeatures.txt 2384 2006-11-01 04:12:43Z magicyang $
+ *
+ * Modified:
+ * Copyright 2004-2006 Analog Devices Inc.
+ *
+ * Bugs: Enter bugs at http://blackfin.uclinux.org/
+ *
+ */
+
+ - Instruction and Data cache initialization.
+ icache_init();
+ dcache_init();
+
+ - Instruction and Data cache Invalidation Routines, when flushing the
+ same is not required.
+ _icache_invalidate();
+ _dcache_invalidate();
+
+ Also, for invalidating the entire instruction and data cache, the below
+ routines are provided (another method for invalidation, refer page no 267 and 287 of
+ ADSP-BF533 Hardware Reference manual)
+
+ invalidate_entire_dcache();
+ invalidate_entire_icache();
+
+ -External Flushing of Instruction and data cache routines.
+
+ flush_instruction_cache();
+ flush_data_cache();
+
+ - Internal Flushing of Instruction and Data Cache.
+
+ icplb_flush();
+ dcplb_flush();
+
+ - Locking the cache.
+
+ cache_grab_lock();
+ cache_lock();
+
+ Please refer linux-2.6.x/Documentation/blackfin/cache-lock.txt for how to
+ lock the cache.
+
+ Locking the cache is optional feature.
+
+ - Miscellaneous cache functions.
+
+ flush_cache_all();
+ flush_cache_mm();
+ invalidate_dcache_range();
+ flush_dcache_range();
+ flush_dcache_page();
+ flush_cache_range();
+ flush_cache_page();
+ invalidate_dcache_range();
+ flush_page_to_ram();
+
diff --git a/Documentation/cciss.txt b/Documentation/cciss.txt
index f74affe..e65736c 100644
--- a/Documentation/cciss.txt
+++ b/Documentation/cciss.txt
@@ -22,14 +22,21 @@ This driver is known to work with the following cards:
* SA E200i
* SA E500
-If nodes are not already created in the /dev/cciss directory, run as root:
+Detecting drive failures:
+-------------------------
-# cd /dev
-# ./MAKEDEV cciss
+To get the status of logical volumes and to detect physical drive
+failures, you can use the cciss_vol_status program found here:
+http://cciss.sourceforge.net/#cciss_utils
Device Naming:
--------------
+If nodes are not already created in the /dev/cciss directory, run as root:
+
+# cd /dev
+# ./MAKEDEV cciss
+
You need some entries in /dev for the cciss device. The MAKEDEV script
can make device nodes for you automatically. Currently the device setup
is as follows:
diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt
new file mode 100644
index 0000000..73cf9fb
--- /dev/null
+++ b/Documentation/fb/deferred_io.txt
@@ -0,0 +1,75 @@
+Deferred IO
+-----------
+
+Deferred IO is a way to delay and repurpose IO. It uses host memory as a
+buffer and the MMU pagefault as a pretrigger for when to perform the device
+IO. The following example may be a useful explaination of how one such setup
+works:
+
+- userspace app like Xfbdev mmaps framebuffer
+- deferred IO and driver sets up nopage and page_mkwrite handlers
+- userspace app tries to write to mmaped vaddress
+- we get pagefault and reach nopage handler
+- nopage handler finds and returns physical page
+- we get page_mkwrite where we add this page to a list
+- schedule a workqueue task to be run after a delay
+- app continues writing to that page with no additional cost. this is
+ the key benefit.
+- the workqueue task comes in and mkcleans the pages on the list, then
+ completes the work associated with updating the framebuffer. this is
+ the real work talking to the device.
+- app tries to write to the address (that has now been mkcleaned)
+- get pagefault and the above sequence occurs again
+
+As can be seen from above, one benefit is roughly to allow bursty framebuffer
+writes to occur at minimum cost. Then after some time when hopefully things
+have gone quiet, we go and really update the framebuffer which would be
+a relatively more expensive operation.
+
+For some types of nonvolatile high latency displays, the desired image is
+the final image rather than the intermediate stages which is why it's okay
+to not update for each write that is occuring.
+
+It may be the case that this is useful in other scenarios as well. Paul Mundt
+has mentioned a case where it is beneficial to use the page count to decide
+whether to coalesce and issue SG DMA or to do memory bursts.
+
+Another one may be if one has a device framebuffer that is in an usual format,
+say diagonally shifting RGB, this may then be a mechanism for you to allow
+apps to pretend to have a normal framebuffer but reswizzle for the device
+framebuffer at vsync time based on the touched pagelist.
+
+How to use it: (for applications)
+---------------------------------
+No changes needed. mmap the framebuffer like normal and just use it.
+
+How to use it: (for fbdev drivers)
+----------------------------------
+The following example may be helpful.
+
+1. Setup your structure. Eg:
+
+static struct fb_deferred_io hecubafb_defio = {
+ .delay = HZ,
+ .deferred_io = hecubafb_dpy_deferred_io,
+};
+
+The delay is the minimum delay between when the page_mkwrite trigger occurs
+and when the deferred_io callback is called. The deferred_io callback is
+explained below.
+
+2. Setup your deferred IO callback. Eg:
+static void hecubafb_dpy_deferred_io(struct fb_info *info,
+ struct list_head *pagelist)
+
+The deferred_io callback is where you would perform all your IO to the display
+device. You receive the pagelist which is the list of pages that were written
+to during the delay. You must not modify this list. This callback is called
+from a workqueue.
+
+3. Call init
+ info->fbdefio = &hecubafb_defio;
+ fb_deferred_io_init(info);
+
+4. Call cleanup
+ fb_deferred_io_cleanup(info);
diff --git a/Documentation/fb/s3fb.txt b/Documentation/fb/s3fb.txt
index 8a04c0d..2c97770 100644
--- a/Documentation/fb/s3fb.txt
+++ b/Documentation/fb/s3fb.txt
@@ -35,10 +35,12 @@ Supported Features
* suspend/resume support
* DPMS support
-Text mode is supported even in higher resolutions, but there is limitation
-to lower pixclocks (maximum between 50-60 MHz, depending on specific hardware).
-This limitation is not enforced by driver. Text mode supports 8bit wide fonts
-only (hardware limitation) and 16bit tall fonts (driver limitation).
+Text mode is supported even in higher resolutions, but there is limitation to
+lower pixclocks (maximum usually between 50-60 MHz, depending on specific
+hardware, i get best results from plain S3 Trio32 card - about 75 MHz). This
+limitation is not enforced by driver. Text mode supports 8bit wide fonts only
+(hardware limitation) and 16bit tall fonts (driver limitation). Text mode
+support is broken on S3 Trio64 V2/DX.
There are two 4 bpp modes. First mode (selected if nonstd == 0) is mode with
packed pixels, high nibble first. Second mode (selected if nonstd == 1) is mode
@@ -73,6 +75,8 @@ Known bugs
==========
* cursor disable in text mode doesn't work
+ * text mode broken on S3 Trio64 V2/DX
+
--
Ondrej Zajicek <santiago@crfreenet.org>
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 94fd03f..676b798 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,14 @@ be removed from this file.
---------------------------
+What: MXSER
+When: December 2007
+Why: Old mxser driver is obsoleted by the mxser_new. Give it some time yet
+ and remove it.
+Who: Jiri Slaby <jirislaby@gmail.com>
+
+---------------------------
+
What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP
When: October 2007
Why: Broken attempt to set MPEG compression parameters. These ioctls are
@@ -126,18 +134,6 @@ Who: Adrian Bunk <bunk@stusta.de>
---------------------------
-What: Usage of invalid timevals in setitimer
-When: March 2007
-Why: POSIX requires to validate timevals in the setitimer call. This
- was never done by Linux. The invalid (e.g. negative timevals) were
- silently converted to more or less random timeouts and intervals.
- Until the removal a per boot limited number of warnings is printed
- and the timevals are sanitized.
-
-Who: Thomas Gleixner <tglx@linutronix.de>
-
----------------------------
-
What: Unused EXPORT_SYMBOL/EXPORT_SYMBOL_GPL exports
(temporary transition config option provided until then)
The transition config option will also be removed at the same time.
@@ -165,7 +161,7 @@ Who: Greg Kroah-Hartman <gregkh@suse.de>
---------------------------
What: Interrupt only SA_* flags
-When: Januar 2007
+When: September 2007
Why: The interrupt related SA_* flags are replaced by IRQF_* to move them
out of the signal namespace.
@@ -332,3 +328,11 @@ Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific
Who: Jean Delvare <khali@linux-fr.org>
---------------------------
+
+What: drivers depending on OSS_OBSOLETE
+When: options in 2.6.23, code in 2.6.25
+Why: obsolete OSS drivers
+Who: Adrian Bunk <bunk@stusta.de>
+
+---------------------------
+
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 28bfea7..59c1415 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -15,6 +15,7 @@ prototypes:
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
locking rules:
none have BKL
@@ -25,6 +26,7 @@ d_compare: no yes no no
d_delete: yes no yes no
d_release: no no no yes
d_iput: no no no yes
+d_dname: no no no no
--------------------------- inode_operations ---------------------------
prototypes:
diff --git a/Documentation/filesystems/jfs.txt b/Documentation/filesystems/jfs.txt
index bae1286..26ebde7 100644
--- a/Documentation/filesystems/jfs.txt
+++ b/Documentation/filesystems/jfs.txt
@@ -29,7 +29,13 @@ errors=continue Keep going on a filesystem error.
errors=remount-ro Default. Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
-Please send bugs, comments, cards and letters to shaggy@austin.ibm.com.
+uid=value Override on-disk uid with specified value
+gid=value Override on-disk gid with specified value
+umask=value Override on-disk umask with specified octal value. For
+ directories, the execute bit will be set if the corresponding
+ read bit is set.
+
+Please send bugs, comments, cards and letters to shaggy@linux.vnet.ibm.com.
The JFS mailing list can be subscribed to by using the link labeled
"Mail list Subscribe" at our web page http://jfs.sourceforge.net/
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 7aaf09b..4f3e84c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -122,21 +122,22 @@ subdirectory has the entries listed in Table 1-1.
Table 1-1: Process specific entries in /proc
..............................................................................
- File Content
- cmdline Command line arguments
- cpu Current and last cpu in which it was executed (2.4)(smp)
- cwd Link to the current working directory
- environ Values of environment variables
- exe Link to the executable of this process
- fd Directory, which contains all file descriptors
- maps Memory maps to executables and library files (2.4)
- mem Memory held by this process
- root Link to the root directory of this process
- stat Process status
- statm Process memory status information
- status Process status in human readable form
- wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
- smaps Extension based on maps, presenting the rss size for each mapped file
+ File Content
+ clear_refs Clears page referenced bits shown in smaps output
+ cmdline Command line arguments
+ cpu Current and last cpu in which it was executed (2.4)(smp)
+ cwd Link to the current working directory
+ environ Values of environment variables
+ exe Link to the executable of this process
+ fd Directory, which contains all file descriptors
+ maps Memory maps to executables and library files (2.4)
+ mem Memory held by this process
+ root Link to the root directory of this process
+ stat Process status
+ statm Process memory status information
+ status Process status in human readable form
+ wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ smaps Extension based on maps, the rss size for each mapped file
..............................................................................
For example, to get the status information of a process, all you have to do is
@@ -1137,6 +1138,13 @@ determine whether or not they are still functioning properly.
Because the NMI watchdog shares registers with oprofile, by disabling the NMI
watchdog, oprofile may have more registers to utilize.
+maps_protect
+------------
+
+Enables/Disables the protection of the per-process proc entries "maps" and
+"smaps". When enabled, the contents of these files are visible only to
+readers that are allowed to ptrace() the given process.
+
2.4 /proc/sys/vm - The virtual memory subsystem
-----------------------------------------------
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 069cb10..fcc123f 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -57,6 +57,13 @@ nonumtail=<bool> -- When creating 8.3 aliases, normally the alias will
currently exist in the directory, 'longfile.txt' will
be the short alias instead of 'longfi~1.txt'.
+usefree -- Use the "free clusters" value stored on FSINFO. It'll
+ be used to determine number of free clusters without
+ scanning disk. But it's not used by default, because
+ recent Windows don't update it correctly in some
+ case. If you are sure the "free clusters" on FSINFO is
+ correct, by this option you can avoid scanning disk.
+
quiet -- Stops printing certain warning messages.
check=s|r|n -- Case sensitivity checking setting.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ea271f2..a47cc81 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -827,7 +827,7 @@ This describes how a filesystem can overload the standard dentry
operations. Dentries and the dcache are the domain of the VFS and the
individual filesystem implementations. Device drivers have no business
here. These methods may be set to NULL, as they are either optional or
-the VFS uses a default. As of kernel 2.6.13, the following members are
+the VFS uses a default. As of kernel 2.6.22, the following members are
defined:
struct dentry_operations {
@@ -837,6 +837,7 @@ struct dentry_operations {
int (*d_delete)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
+ char *(*d_dname)(struct dentry *, char *, int);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
@@ -859,6 +860,26 @@ struct dentry_operations {
VFS calls iput(). If you define this method, you must call
iput() yourself
+ d_dname: called when the pathname of a dentry should be generated.
+ Usefull for some pseudo filesystems (sockfs, pipefs, ...) to delay
+ pathname generation. (Instead of doing it when dentry is created,
+ its done only when the path is needed.). Real filesystems probably
+ dont want to use it, because their dentries are present in global
+ dcache hash, so their hash should be an invariant. As no lock is
+ held, d_dname() should not try to modify the dentry itself, unless
+ appropriate SMP safety is used. CAUTION : d_path() logic is quite
+ tricky. The correct way to return for example "Hello" is to put it
+ at the end of the buffer, and returns a pointer to the first char.
+ dynamic_dname() helper function is provided to take care of this.
+
+Example :
+
+static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+{
+ return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ dentry->d_inode->i_ino);
+}
+
Each dentry has a pointer to its parent dentry, as well as a hash list
of child dentries. Child dentries are basically like files in a
directory.
diff --git a/Documentation/hwmon/coretemp b/Documentation/hwmon/coretemp
new file mode 100644
index 0000000..870cda9
--- /dev/null
+++ b/Documentation/hwmon/coretemp
@@ -0,0 +1,36 @@
+Kernel driver coretemp
+======================
+
+Supported chips:
+ * All Intel Core family
+ Prefix: 'coretemp'
+ CPUID: family 0x6, models 0xe, 0xf
+ Datasheet: Intel 64 and IA-32 Architectures Software Developer's Manual
+ Volume 3A: System Programming Guide
+
+Author: Rudolf Marek
+
+Description
+-----------
+
+This driver permits reading temperature sensor embedded inside Intel Core CPU.
+Temperature is measured in degrees Celsius and measurement resolution is
+1 degree C. Valid temperatures are from 0 to TjMax degrees C, because
+the actual value of temperature register is in fact a delta from TjMax.
+
+Temperature known as TjMax is the maximum junction temperature of processor.
+Intel defines this temperature as 85C or 100C. At this temperature, protection
+mechanism will perform actions to forcibly cool down the processor. Alarm
+may be raised, if the temperature grows enough (more than TjMax) to trigger
+the Out-Of-Spec bit. Following table summarizes the exported sysfs files:
+
+temp1_input - Core temperature (in millidegrees Celsius).
+temp1_crit - Maximum junction temperature (in millidegrees Celsius).
+temp1_crit_alarm - Set when Out-of-spec bit is set, never clears.
+ Correct CPU operation is no longer guaranteed.
+temp1_label - Contains string "Core X", where X is processor
+ number.
+
+The TjMax temperature is set to 85 degrees C if undocumented model specific
+register (UMSR) 0xee has bit 30 set. If not the TjMax is 100 degrees C as
+(sometimes) documented in processor datasheet.
diff --git a/Documentation/hwmon/max6650 b/Documentation/hwmon/max6650
new file mode 100644
index 0000000..8be7beb
--- /dev/null
+++ b/Documentation/hwmon/max6650
@@ -0,0 +1,53 @@
+Kernel driver max6650
+=====================
+
+Supported chips:
+ * Maxim 6650 / 6651
+ Prefix: 'max6650'
+ Addresses scanned: I2C 0x1b, 0x1f, 0x48, 0x4b
+ Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6650-MAX6651.pdf
+
+Authors:
+ Hans J. Koch <hjk@linutronix.de>
+ John Morris <john.morris@spirentcom.com>
+ Claus Gindhart <claus.gindhart@kontron.com>
+
+Description
+-----------
+
+This driver implements support for the Maxim 6650/6651
+
+The 2 devices are very similar, but the Maxim 6550 has a reduced feature
+set, e.g. only one fan-input, instead of 4 for the 6651.
+
+The driver is not able to distinguish between the 2 devices.
+
+The driver provides the following sensor accesses in sysfs:
+
+fan1_input ro fan tachometer speed in RPM
+fan2_input ro "
+fan3_input ro "
+fan4_input ro "
+fan1_target rw desired fan speed in RPM (closed loop mode only)
+pwm1_enable rw regulator mode, 0=full on, 1=open loop, 2=closed loop
+pwm1 rw relative speed (0-255), 255=max. speed.
+ Used in open loop mode only.
+fan1_div rw sets the speed range the inputs can handle. Legal
+ values are 1, 2, 4, and 8. Use lower values for
+ faster fans.
+
+Module parameters
+-----------------
+
+If your board has a BIOS that initializes the MAX6650/6651 correctly, you can
+simply load your module without parameters. It won't touch the configuration
+registers then. If your board BIOS doesn't initialize the chip, or you want
+different settings, you can set the following parameters:
+
+voltage_12V: 5=5V fan, 12=12V fan, 0=don't change
+prescaler: Possible values are 1,2,4,8,16, or 0 for don't change
+clock: The clock frequency in Hz of the chip the driver should assume [254000]
+
+Please have a look at the MAX6650/6651 data sheet and make sure that you fully
+understand the meaning of these parameters before you attempt to change them.
+
diff --git a/Documentation/hwmon/smsc47m1 b/Documentation/hwmon/smsc47m1
index 04a1112..42c8431 100644
--- a/Documentation/hwmon/smsc47m1
+++ b/Documentation/hwmon/smsc47m1
@@ -14,6 +14,10 @@ Supported chips:
http://www.smsc.com/main/datasheets/47m14x.pdf
http://www.smsc.com/main/tools/discontinued/47m15x.pdf
http://www.smsc.com/main/datasheets/47m192.pdf
+ * SMSC LPC47M292
+ Addresses scanned: none, address read from Super I/O config space
+ Prefix: 'smsc47m2'
+ Datasheet: Not public
* SMSC LPC47M997
Addresses scanned: none, address read from Super I/O config space
Prefix: 'smsc47m1'
@@ -32,9 +36,10 @@ Description
The Standard Microsystems Corporation (SMSC) 47M1xx Super I/O chips
contain monitoring and PWM control circuitry for two fans.
-The 47M15x and 47M192 chips contain a full 'hardware monitoring block'
-in addition to the fan monitoring and control. The hardware monitoring
-block is not supported by the driver.
+The LPC47M15x, LPC47M192 and LPC47M292 chips contain a full 'hardware
+monitoring block' in addition to the fan monitoring and control. The
+hardware monitoring block is not supported by this driver, use the
+smsc47m192 driver for that.
No documentation is available for the 47M997, but it has the same device
ID as the 47M15x and 47M192 chips and seems to be compatible.
diff --git a/Documentation/hwmon/smsc47m192 b/Documentation/hwmon/smsc47m192
index 45d6453..6d54ecb 100644
--- a/Documentation/hwmon/smsc47m192
+++ b/Documentation/hwmon/smsc47m192
@@ -2,12 +2,13 @@ Kernel driver smsc47m192
========================
Supported chips:
- * SMSC LPC47M192 and LPC47M997
+ * SMSC LPC47M192, LPC47M15x, LPC47M292 and LPC47M997
Prefix: 'smsc47m192'
Addresses scanned: I2C 0x2c - 0x2d
Datasheet: The datasheet for LPC47M192 is publicly available from
http://www.smsc.com/
- The LPC47M997 is compatible for hardware monitoring.
+ The LPC47M15x, LPC47M292 and LPC47M997 are compatible for
+ hardware monitoring.
Author: Hartmut Rick <linux@rick.claranet.de>
Special thanks to Jean Delvare for careful checking
@@ -18,7 +19,7 @@ Description
-----------
This driver implements support for the hardware sensor capabilities
-of the SMSC LPC47M192 and LPC47M997 Super-I/O chips.
+of the SMSC LPC47M192 and compatible Super-I/O chips.
These chips support 3 temperature channels and 8 voltage inputs
as well as CPU voltage VID input.
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index d73d2e8..a9a18ad 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -152,6 +152,13 @@ fan[1-*]_div Fan divisor.
Note that this is actually an internal clock divisor, which
affects the measurable speed range, not the read value.
+fan[1-*]_target
+ Desired fan speed
+ Unit: revolution/min (RPM)
+ RW
+ Only makes sense if the chip supports closed-loop fan speed
+ control based on the measured fan speed.
+
Also see the Alarms section for status flags associated with fans.
diff --git a/Documentation/ia64/aliasing-test.c b/Documentation/ia64/aliasing-test.c
new file mode 100644
index 0000000..3153167
--- /dev/null
+++ b/Documentation/ia64/aliasing-test.c
@@ -0,0 +1,247 @@
+/*
+ * Exercise /dev/mem mmap cases that have been troublesome in the past
+ *
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ * Bjorn Helgaas <bjorn.helgaas@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+int sum;
+
+int map_mem(char *path, off_t offset, size_t length, int touch)
+{
+ int fd, rc;
+ void *addr;
+ int *c;
+
+ fd = open(path, O_RDWR);
+ if (fd == -1) {
+ perror(path);
+ return -1;
+ }
+
+ addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
+ if (addr == MAP_FAILED)
+ return 1;
+
+ if (touch) {
+ c = (int *) addr;
+ while (c < (int *) (offset + length))
+ sum += *c++;
+ }
+
+ rc = munmap(addr, length);
+ if (rc == -1) {
+ perror("munmap");
+ return -1;
+ }
+
+ close(fd);
+ return 0;
+}
+
+int scan_sysfs(char *path, char *file, off_t offset, size_t length, int touch)
+{
+ struct dirent **namelist;
+ char *name, *path2;
+ int i, n, r, rc, result = 0;
+ struct stat buf;
+
+ n = scandir(path, &namelist, 0, alphasort);
+ if (n < 0) {
+ perror("scandir");
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ name = namelist[i]->d_name;
+
+ if (fnmatch(".", name, 0) == 0)
+ goto skip;
+ if (fnmatch("..", name, 0) == 0)
+ goto skip;
+
+ path2 = malloc(strlen(path) + strlen(name) + 3);
+ strcpy(path2, path);
+ strcat(path2, "/");
+ strcat(path2, name);
+
+ if (fnmatch(file, name, 0) == 0) {
+ rc = map_mem(path2, offset, length, touch);
+ if (rc == 0)
+ fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, offset, offset + length, touch ? "readable" : "mappable");
+ else if (rc > 0)
+ fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, offset, offset + length);
+ else {
+ fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, offset, offset + length);
+ return rc;
+ }
+ } else {
+ r = lstat(path2, &buf);
+ if (r == 0 && S_ISDIR(buf.st_mode)) {
+ rc = scan_sysfs(path2, file, offset, length, touch);
+ if (rc < 0)
+ return rc;
+ }
+ }
+
+ result |= rc;
+ free(path2);
+
+skip:
+ free(namelist[i]);
+ }
+ free(namelist);
+ return rc;
+}
+
+char buf[1024];
+
+int read_rom(char *path)
+{
+ int fd, rc;
+ size_t size = 0;
+
+ fd = open(path, O_RDWR);
+ if (fd == -1) {
+ perror(path);
+ return -1;
+ }
+
+ rc = write(fd, "1", 2);
+ if (rc <= 0) {
+ perror("write");
+ return -1;
+ }
+
+ do {
+ rc = read(fd, buf, sizeof(buf));
+ if (rc > 0)
+ size += rc;
+ } while (rc > 0);
+
+ close(fd);
+ return size;
+}
+
+int scan_rom(char *path, char *file)
+{
+ struct dirent **namelist;
+ char *name, *path2;
+ int i, n, r, rc, result = 0;
+ struct stat buf;
+
+ n = scandir(path, &namelist, 0, alphasort);
+ if (n < 0) {
+ perror("scandir");
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ name = namelist[i]->d_name;
+
+ if (fnmatch(".", name, 0) == 0)
+ goto skip;
+ if (fnmatch("..", name, 0) == 0)
+ goto skip;
+
+ path2 = malloc(strlen(path) + strlen(name) + 3);
+ strcpy(path2, path);
+ strcat(path2, "/");
+ strcat(path2, name);
+
+ if (fnmatch(file, name, 0) == 0) {
+ rc = read_rom(path2);
+
+ /*
+ * It's OK if the ROM is unreadable. Maybe there
+ * is no ROM, or some other error ocurred. The
+ * important thing is that no MCA happened.
+ */
+ if (rc > 0)
+ fprintf(stderr, "PASS: %s read %ld bytes\n", path2, rc);
+ else {
+ fprintf(stderr, "PASS: %s not readable\n", path2);
+ return rc;
+ }
+ } else {
+ r = lstat(path2, &buf);
+ if (r == 0 && S_ISDIR(buf.st_mode)) {
+ rc = scan_rom(path2, file);
+ if (rc < 0)
+ return rc;
+ }
+ }
+
+ result |= rc;
+ free(path2);
+
+skip:
+ free(namelist[i]);
+ }
+ free(namelist);
+ return rc;
+}
+
+main()
+{
+ int rc;
+
+ if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n");
+
+ /*
+ * It's not safe to blindly read the VGA frame buffer. If you know
+ * how to poke the card the right way, it should respond, but it's
+ * not safe in general. Many machines, e.g., Intel chipsets, cover
+ * up a non-responding card by just returning -1, but others will
+ * report the failure as a machine check.
+ */
+ if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n");
+
+ if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0)
+ fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n");
+
+ /*
+ * Often you can map all the individual pieces above (0-0xA0000,
+ * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole
+ * thing at once. This is because the individual pieces use different
+ * attributes, and there's no single attribute supported over the
+ * whole region.
+ */
+ rc = map_mem("/dev/mem", 0, 1024*1024, 0);
+ if (rc == 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n");
+ else if (rc > 0)
+ fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n");
+ else
+ fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n");
+
+ scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1);
+ scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0);
+ scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1);
+ scan_sysfs("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0);
+
+ scan_rom("/sys/devices", "rom");
+}
diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
index 38f9a52..9a431a7 100644
--- a/Documentation/ia64/aliasing.txt
+++ b/Documentation/ia64/aliasing.txt
@@ -112,16 +112,6 @@ POTENTIAL ATTRIBUTE ALIASING CASES
The /dev/mem mmap constraints apply.
- However, since this is for mapping legacy MMIO space, WB access
- does not make sense. This matters on machines without legacy
- VGA support: these machines may have WB memory for the entire
- first megabyte (or even the entire first granule).
-
- On these machines, we could mmap legacy_mem as WB, which would
- be safe in terms of attribute aliasing, but X has no way of
- knowing that it is accessing regular memory, not a frame buffer,
- so the kernel should fail the mmap rather than doing it with WB.
-
read/write of /dev/mem
This uses copy_from_user(), which implicitly uses a kernel
@@ -138,14 +128,20 @@ POTENTIAL ATTRIBUTE ALIASING CASES
ioremap()
- This returns a kernel identity mapping for use inside the
- kernel.
+ This returns a mapping for use inside the kernel.
If the region is in kern_memmap, we should use the attribute
- specified there. Otherwise, if the EFI memory map reports that
- the entire granule supports WB, we should use that (granules
- that are partially reserved or occupied by firmware do not appear
- in kern_memmap). Otherwise, we should use a UC mapping.
+ specified there.
+
+ If the EFI memory map reports that the entire granule supports
+ WB, we should use that (granules that are partially reserved
+ or occupied by firmware do not appear in kern_memmap).
+
+ If the granule contains non-WB memory, but we can cover the
+ region safely with kernel page table mappings, we can use
+ ioremap_page_range() as most other architectures do.
+
+ Failing all of the above, we have to fall back to a UC mapping.
PAST PROBLEM CASES
@@ -158,7 +154,7 @@ PAST PROBLEM CASES
succeed. It may create either WB or UC user mappings, depending
on whether the region is in kern_memmap or the EFI memory map.
- mmap of 0x0-0xA0000 /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+ mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
See https://bugzilla.novell.com/show_bug.cgi?id=140858.
@@ -171,28 +167,25 @@ PAST PROBLEM CASES
so it is safe to use WB mappings.
The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
- which will use a granule-sized UC mapping covering 0-0xFFFFF. This
- granule covers some WB-only memory, but since UC is non-speculative,
- the processor will never generate an uncacheable reference to the
- WB-only areas unless the driver explicitly touches them.
+ which uses a granule-sized UC mapping. This granule will cover some
+ WB-only memory, but since UC is non-speculative, the processor will
+ never generate an uncacheable reference to the WB-only areas unless
+ the driver explicitly touches them.
mmap of 0x0-0xFFFFF legacy_mem by "X"
- If the EFI memory map reports this entire range as WB, there
- is no VGA MMIO hole, and the mmap should fail or be done with
- a WB mapping.
+ If the EFI memory map reports that the entire range supports the
+ same attributes, we can allow the mmap (and we will prefer WB if
+ supported, as is the case with HP sx[12]000 machines with VGA
+ disabled).
- There's no easy way for X to determine whether the 0xA0000-0xBFFFF
- region is a frame buffer or just memory, so I think it's best to
- just fail this mmap request rather than using a WB mapping. As
- far as I know, there's no need to map legacy_mem with WB
- mappings.
+ If EFI reports the range as partly WB and partly UC (as on sx[12]000
+ machines with VGA enabled), we must fail the mmap because there's no
+ safe attribute to use.
- Otherwise, a UC mapping of the entire region is probably safe.
- The VGA hole means the region will not be in kern_memmap. The
- HP sx1000 chipset doesn't support UC access to the memory surrounding
- the VGA hole, but X doesn't need that area anyway and should not
- reference it.
+ If EFI reports some of the range but not all (as on Intel firmware
+ that doesn't report the VGA frame buffer at all), we should fail the
+ mmap and force the user to map just the specific region of interest.
mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
@@ -202,6 +195,16 @@ PAST PROBLEM CASES
This is a special case of the previous case, and the mmap should
fail for the same reason as above.
+ read of /sys/devices/.../rom
+
+ For VGA devices, this may cause an ioremap() of 0xC0000. This
+ used to be done with a UC mapping, because the VGA frame buffer
+ at 0xA0000 prevents use of a WB granule. The UC mapping causes
+ an MCA on HP sx[12]000 chipsets.
+
+ We should use WB page table mappings to avoid covering the VGA
+ frame buffer.
+
NOTES
[1] SDM rev 2.2, vol 2, sec 4.4.1.
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
new file mode 100644
index 0000000..6449a70
--- /dev/null
+++ b/Documentation/ia64/err_inject.txt
@@ -0,0 +1,1068 @@
+
+IPF Machine Check (MC) error inject tool
+========================================
+
+IPF Machine Check (MC) error inject tool is used to inject MC
+errors from Linux. The tool is a test bed for IPF MC work flow including
+hardware correctable error handling, OS recoverable error handling, MC
+event logging, etc.
+
+The tool includes two parts: a kernel driver and a user application
+sample. The driver provides interface to PAL to inject error
+and query error injection capabilities. The driver code is in
+arch/ia64/kernel/err_inject.c. The application sample (shown below)
+provides a combination of various errors and calls the driver's interface
+(sysfs interface) to inject errors or query error injection capabilities.
+
+The tool can be used to test Intel IPF machine MC handling capabilities.
+It's especially useful for people who can not access hardware MC injection
+tool to inject error. It's also very useful to integrate with other
+software test suits to do stressful testing on IPF.
+
+Below is a sample application as part of the whole tool. The sample
+can be used as a working test tool. Or it can be expanded to include
+more features. It also can be a integrated into a libary or other user
+application to have more thorough test.
+
+The sample application takes err.conf as error configuation input. Gcc
+compiles the code. After you install err_inject driver, you can run
+this sample application to inject errors.
+
+Errata: Itanium 2 Processors Specification Update lists some errata against
+the pal_mc_error_inject PAL procedure. The following err.conf has been tested
+on latest Montecito PAL.
+
+err.conf:
+
+#This is configuration file for err_inject_tool.
+#The format of the each line is:
+#cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
+#where
+# cpu: logical cpu number the error will be inject in.
+# loop: times the error will be injected.
+# interval: In second. every so often one error is injected.
+# err_type_info, err_struct_info: PAL parameters.
+#
+#Note: All values are hex w/o or w/ 0x prefix.
+
+
+#On cpu2, inject only total 0x10 errors, interval 5 seconds
+#corrected, data cache, hier-2, physical addr(assigned by tool code).
+#working on Montecito latest PAL.
+2, 10, 5, 4101, 95
+
+#On cpu4, inject and consume total 0x10 errors, interval 5 seconds
+#corrected, data cache, hier-2, physical addr(assigned by tool code).
+#working on Montecito latest PAL.
+4, 10, 5, 4109, 95
+
+#On cpu15, inject and consume total 0x10 errors, interval 5 seconds
+#recoverable, DTR0, hier-2.
+#working on Montecito latest PAL.
+0xf, 0x10, 5, 4249, 15
+
+The sample application source code:
+
+err_injection_tool.c:
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Copyright (C) 2006 Intel Co
+ * Fenghua Yu <fenghua.yu@intel.com>
+ *
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+
+#define MAX_FN_SIZE 256
+#define MAX_BUF_SIZE 256
+#define DATA_BUF_SIZE 256
+#define NR_CPUS 512
+#define MAX_TASK_NUM 2048
+#define MIN_INTERVAL 5 // seconds
+#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte.
+#define PARA_FIELD_NUM 5
+#define MASK_SIZE (NR_CPUS/64)
+#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
+
+int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
+
+int verbose;
+#define vbprintf if (verbose) printf
+
+int log_info(int cpu, const char *fmt, ...)
+{
+ FILE *log;
+ char fn[MAX_FN_SIZE];
+ char buf[MAX_BUF_SIZE];
+ va_list args;
+
+ sprintf(fn, "%d.log", cpu);
+ log=fopen(fn, "a+");
+ if (log==NULL) {
+ perror("Error open:");
+ return -1;
+ }
+
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ memset(buf, 0, MAX_BUF_SIZE);
+ vsprintf(buf, fmt, args);
+ va_end(args);
+
+ fwrite(buf, sizeof(buf), 1, log);
+ fclose(log);
+
+ return 0;
+}
+
+typedef unsigned long u64;
+typedef unsigned int u32;
+
+typedef union err_type_info_u {
+ struct {
+ u64 mode : 3, /* 0-2 */
+ err_inj : 3, /* 3-5 */
+ err_sev : 2, /* 6-7 */
+ err_struct : 5, /* 8-12 */
+ struct_hier : 3, /* 13-15 */
+ reserved : 48; /* 16-63 */
+ } err_type_info_u;
+ u64 err_type_info;
+} err_type_info_t;
+
+typedef union err_struct_info_u {
+ struct {
+ u64 siv : 1, /* 0 */
+ c_t : 2, /* 1-2 */
+ cl_p : 3, /* 3-5 */
+ cl_id : 3, /* 6-8 */
+ cl_dp : 1, /* 9 */
+ reserved1 : 22, /* 10-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_cache;
+ struct {
+ u64 siv : 1, /* 0 */
+ tt : 2, /* 1-2 */
+ tc_tr : 2, /* 3-4 */
+ tr_slot : 8, /* 5-12 */
+ reserved1 : 19, /* 13-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_tlb;
+ struct {
+ u64 siv : 1, /* 0 */
+ regfile_id : 4, /* 1-4 */
+ reg_num : 7, /* 5-11 */
+ reserved1 : 20, /* 12-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_register;
+ struct {
+ u64 reserved;
+ } err_struct_info_bus_processor_interconnect;
+ u64 err_struct_info;
+} err_struct_info_t;
+
+typedef union err_data_buffer_u {
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ : 39; /* 153-191 */
+ } err_data_buffer_cache;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ reserved : 39; /* 153-191 */
+ } err_data_buffer_tlb;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ } err_data_buffer_register;
+ struct {
+ u64 reserved; /* 0-63 */
+ } err_data_buffer_bus_processor_interconnect;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+} err_data_buffer_t;
+
+typedef union capabilities_u {
+ struct {
+ u64 i : 1,
+ d : 1,
+ rv : 1,
+ tag : 1,
+ data : 1,
+ mesi : 1,
+ dp : 1,
+ reserved1 : 3,
+ pa : 1,
+ va : 1,
+ wi : 1,
+ reserved2 : 20,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_cache;
+ struct {
+ u64 d : 1,
+ i : 1,
+ rv : 1,
+ tc : 1,
+ tr : 1,
+ reserved1 : 27,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved2 : 30;
+ } capabilities_tlb;
+ struct {
+ u64 gr_b0 : 1,
+ gr_b1 : 1,
+ fr : 1,
+ br : 1,
+ pr : 1,
+ ar : 1,
+ cr : 1,
+ rr : 1,
+ pkr : 1,
+ dbr : 1,
+ ibr : 1,
+ pmc : 1,
+ pmd : 1,
+ reserved1 : 3,
+ regnum : 1,
+ reserved2 : 15,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_register;
+ struct {
+ u64 reserved;
+ } capabilities_bus_processor_interconnect;
+} capabilities_t;
+
+typedef struct resources_s {
+ u64 ibr0 : 1,
+ ibr2 : 1,
+ ibr4 : 1,
+ ibr6 : 1,
+ dbr0 : 1,
+ dbr2 : 1,
+ dbr4 : 1,
+ dbr6 : 1,
+ reserved : 48;
+} resources_t;
+
+
+long get_page_size(void)
+{
+ long page_size=sysconf(_SC_PAGESIZE);
+ return page_size;
+}
+
+#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
+#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
+#define SHM_VA 0x2000000100000000
+
+int shmid;
+void *shmaddr;
+
+int create_shm(void)
+{
+ key_t key;
+ char fn[MAX_FN_SIZE];
+
+ /* cpu0 is always existing */
+ sprintf(fn, PATH_FORMAT, 0);
+ if ((key = ftok(fn, 's')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
+ if (shmid == -1) {
+ if (errno==EEXIST) {
+ shmid = shmget(key, SHM_SIZE, 0);
+ if (shmid == -1) {
+ perror("shmget");
+ return -1;
+ }
+ }
+ else {
+ perror("shmget");
+ return -1;
+ }
+ }
+ vbprintf("shmid=%d", shmid);
+
+ /* connect to the segment: */
+ shmaddr = shmat(shmid, (void *)SHM_VA, 0);
+ if (shmaddr == (void*)-1) {
+ perror("shmat");
+ return -1;
+ }
+
+ memset(shmaddr, 0, SHM_SIZE);
+ mlock(shmaddr, SHM_SIZE);
+
+ return 0;
+}
+
+int free_shm()
+{
+ munlock(shmaddr, SHM_SIZE);
+ shmdt(shmaddr);
+ semctl(shmid, 0, IPC_RMID);
+
+ return 0;
+}
+
+#ifdef _SEM_SEMUN_UNDEFINED
+union semun
+{
+ int val;
+ struct semid_ds *buf;
+ unsigned short int *array;
+ struct seminfo *__buf;
+};
+#endif
+
+u32 mode=1; /* 1: physical mode; 2: virtual mode. */
+int one_lock=1;
+key_t key[NR_CPUS];
+int semid[NR_CPUS];
+
+int create_sem(int cpu)
+{
+ union semun arg;
+ char fn[MAX_FN_SIZE];
+ int sid;
+
+ sprintf(fn, PATH_FORMAT, cpu);
+ sprintf(fn, "%s/%s", fn, "err_type_info");
+ if ((key[cpu] = ftok(fn, 'e')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ if (semid[cpu]!=0)
+ return 0;
+
+ /* clear old semaphore */
+ if ((sid = semget(key[cpu], 1, 0)) != -1)
+ semctl(sid, 0, IPC_RMID);
+
+ /* get one semaphore */
+ if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
+ perror("semget");
+ printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
+ (u64)key[cpu]);
+ return -1;
+ }
+
+ vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
+ (u64)key[cpu]);
+ /* initialize the semaphore to 1: */
+ arg.val = 1;
+ if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
+ perror("semctl");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int lock(int cpu)
+{
+ struct sembuf lock;
+
+ lock.sem_num = cpu;
+ lock.sem_op = 1;
+ semop(semid[cpu], &lock, 1);
+
+ return 0;
+}
+
+static int unlock(int cpu)
+{
+ struct sembuf unlock;
+
+ unlock.sem_num = cpu;
+ unlock.sem_op = -1;
+ semop(semid[cpu], &unlock, 1);
+
+ return 0;
+}
+
+void free_sem(int cpu)
+{
+ semctl(semid[cpu], 0, IPC_RMID);
+}
+
+int wr_multi(char *fn, unsigned long *data, int size)
+{
+ int fd;
+ char buf[MAX_BUF_SIZE];
+ int ret;
+
+ if (size==1)
+ sprintf(buf, "%lx", *data);
+ else if (size==3)
+ sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
+ else {
+ fprintf(stderr,"write to file with wrong size!\n");
+ return -1;
+ }
+
+ fd=open(fn, O_RDWR);
+ if (!fd) {
+ perror("Error:");
+ return -1;
+ }
+ ret=write(fd, buf, sizeof(buf));
+ close(fd);
+ return ret;
+}
+
+int wr(char *fn, unsigned long data)
+{
+ return wr_multi(fn, &data, 1);
+}
+
+int rd(char *fn, unsigned long *data)
+{
+ int fd;
+ char buf[MAX_BUF_SIZE];
+
+ fd=open(fn, O_RDONLY);
+ if (fd<0) {
+ perror("Error:");
+ return -1;
+ }
+ read(fd, buf, MAX_BUF_SIZE);
+ *data=strtoul(buf, NULL, 16);
+ close(fd);
+ return 0;
+}
+
+int rd_status(char *path, int *status)
+{
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/status", path);
+ if (rd(fn, (u64*)status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int rd_capabilities(char *path, u64 *capabilities)
+{
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn, capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int rd_all(char *path)
+{
+ unsigned long err_type_info, err_struct_info, err_data_buffer;
+ int status;
+ unsigned long capabilities, resources;
+ char fn[MAX_FN_SIZE];
+
+ sprintf(fn, "%s/err_type_info", path);
+ if (rd(fn, &err_type_info)<0) {
+ perror("err_type_info reading error.\n");
+ return -1;
+ }
+ printf("err_type_info=%lx\n", err_type_info);
+
+ sprintf(fn, "%s/err_struct_info", path);
+ if (rd(fn, &err_struct_info)<0) {
+ perror("err_struct_info reading error.\n");
+ return -1;
+ }
+ printf("err_struct_info=%lx\n", err_struct_info);
+
+ sprintf(fn, "%s/err_data_buffer", path);
+ if (rd(fn, &err_data_buffer)<0) {
+ perror("err_data_buffer reading error.\n");
+ return -1;
+ }
+ printf("err_data_buffer=%lx\n", err_data_buffer);
+
+ sprintf(fn, "%s/status", path);
+ if (rd("status", (u64*)&status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+ printf("status=%d\n", status);
+
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn,&capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+ printf("capabilities=%lx\n", capabilities);
+
+ sprintf(fn, "%s/resources", path);
+ if (rd(fn, &resources)<0) {
+ perror("resources reading error.\n");
+ return -1;
+ }
+ printf("resources=%lx\n", resources);
+
+ return 0;
+}
+
+int query_capabilities(char *path, err_type_info_t err_type_info,
+ u64 *capabilities)
+{
+ char fn[MAX_FN_SIZE];
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+
+ err_struct_info.err_struct_info=0;
+ memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
+
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, 0x0);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn, mode);
+
+ if (rd_capabilities(path, capabilities)<0)
+ return -1;
+
+ return 0;
+}
+
+int query_all_capabilities()
+{
+ int status;
+ err_type_info_t err_type_info;
+ int err_sev, err_struct, struct_hier;
+ int cap=0;
+ u64 capabilities;
+ char path[MAX_FN_SIZE];
+
+ err_type_info.err_type_info=0; // Initial
+ err_type_info.err_type_info_u.mode=0; // Query mode;
+ err_type_info.err_type_info_u.err_inj=0;
+
+ printf("All capabilities implemented in pal_mc_error_inject:\n");
+ sprintf(path, PATH_FORMAT ,0);
+ for (err_sev=0;err_sev<3;err_sev++)
+ for (err_struct=0;err_struct<5;err_struct++)
+ for (struct_hier=0;struct_hier<5;struct_hier++)
+ {
+ status=-1;
+ capabilities=0;
+ err_type_info.err_type_info_u.err_sev=err_sev;
+ err_type_info.err_type_info_u.err_struct=err_struct;
+ err_type_info.err_type_info_u.struct_hier=struct_hier;
+
+ if (query_capabilities(path, err_type_info, &capabilities)<0)
+ continue;
+
+ if (rd_status(path, &status)<0)
+ continue;
+
+ if (status==0) {
+ cap=1;
+ printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
+ err_sev, err_struct, struct_hier);
+ printf("capabilities 0x%lx\n", capabilities);
+ }
+ }
+ if (!cap) {
+ printf("No capabilities supported.\n");
+ return 0;
+ }
+
+ return 0;
+}
+
+int err_inject(int cpu, char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t err_data_buffer)
+{
+ int status;
+ char fn[MAX_FN_SIZE];
+
+ log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
+ err_type_info.err_type_info,
+ err_struct_info.err_struct_info);
+ log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
+ err_data_buffer.err_data_buffer[0],
+ err_data_buffer.err_data_buffer[1],
+ err_data_buffer.err_data_buffer[2]);
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, err_struct_info.err_struct_info);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn,mode);
+
+ if (rd_status(path, &status)<0) {
+ vbprintf("fail: read status\n");
+ return -100;
+ }
+
+ if (status!=0) {
+ log_info(cpu, "fail: status=%d\n", status);
+ return status;
+ }
+
+ return status;
+}
+
+static int construct_data_buf(char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t *err_data_buffer,
+ void *va1)
+{
+ char fn[MAX_FN_SIZE];
+ u64 virt_addr=0, phys_addr=0;
+
+ vbprintf("va1=%lx\n", (u64)va1);
+ memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
+
+ switch (err_type_info.err_type_info_u.err_struct) {
+ case 1: // Cache
+ switch (err_struct_info.err_struct_info_cache.cl_id) {
+ case 1: //Virtual addr
+ err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
+ break;
+ case 2: //Phys addr
+ sprintf(fn, "%s/virtual_to_phys", path);
+ virt_addr=(u64)va1;
+ if (wr(fn,virt_addr)<0)
+ return -1;
+ rd(fn, &phys_addr);
+ err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
+ break;
+ default:
+ printf("Not supported cl_id\n");
+ break;
+ }
+ break;
+ case 2: // TLB
+ break;
+ case 3: // Register file
+ break;
+ case 4: // Bus/system interconnect
+ default:
+ printf("Not supported err_struct\n");
+ break;
+ }
+
+ return 0;
+}
+
+typedef struct {
+ u64 cpu;
+ u64 loop;
+ u64 interval;
+ u64 err_type_info;
+ u64 err_struct_info;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+} parameters_t;
+
+parameters_t line_para;
+int para;
+
+static int empty_data_buffer(u64 *err_data_buffer)
+{
+ int empty=1;
+ int i;
+
+ for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
+ if (err_data_buffer[i]!=-1)
+ empty=0;
+
+ return empty;
+}
+
+int err_inj()
+{
+ err_type_info_t err_type_info;
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+ int count;
+ FILE *fp;
+ unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
+ u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
+ int num;
+ int i;
+ char path[MAX_FN_SIZE];
+ parameters_t parameters[MAX_TASK_NUM]={};
+ pid_t child_pid[MAX_TASK_NUM];
+ time_t current_time;
+ int status;
+
+ if (!para) {
+ fp=fopen("err.conf", "r");
+ if (fp==NULL) {
+ perror("Error open err.conf");
+ return -1;
+ }
+
+ num=0;
+ while (!feof(fp)) {
+ char buf[256];
+ memset(buf,0,256);
+ fgets(buf, 256, fp);
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf,
+ &err_data_buffer_conf[0],
+ &err_data_buffer_conf[1],
+ &err_data_buffer_conf[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ err_data_buffer_conf[0]=-1;
+ err_data_buffer_conf[1]=-1;
+ err_data_buffer_conf[2]=-1;
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf);
+ if (count!=PARA_FIELD_NUM)
+ continue;
+ }
+
+ parameters[num].cpu=cpu;
+ parameters[num].loop=loop;
+ parameters[num].interval= interval>MIN_INTERVAL
+ ?interval:MIN_INTERVAL;
+ parameters[num].err_type_info=err_type_info_conf;
+ parameters[num].err_struct_info=err_struct_info_conf;
+ memcpy(parameters[num++].err_data_buffer,
+ err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
+
+ if (num>=MAX_TASK_NUM)
+ break;
+ }
+ }
+ else {
+ parameters[0].cpu=line_para.cpu;
+ parameters[0].loop=line_para.loop;
+ parameters[0].interval= line_para.interval>MIN_INTERVAL
+ ?line_para.interval:MIN_INTERVAL;
+ parameters[0].err_type_info=line_para.err_type_info;
+ parameters[0].err_struct_info=line_para.err_struct_info;
+ memcpy(parameters[0].err_data_buffer,
+ line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
+
+ num=1;
+ }
+
+ /* Create semaphore: If one_lock, one semaphore for all processors.
+ Otherwise, one sempaphore for each processor. */
+ if (one_lock) {
+ if (create_sem(0)) {
+ printf("Can not create semaphore...exit\n");
+ free_sem(0);
+ return -1;
+ }
+ }
+ else {
+ for (i=0;i<num;i++) {
+ if (create_sem(parameters[i].cpu)) {
+ printf("Can not create semaphore for cpu%d...exit\n",i);
+ free_sem(parameters[num].cpu);
+ return -1;
+ }
+ }
+ }
+
+ /* Create a shm segment which will be used to inject/consume errors on.*/
+ if (create_shm()==-1) {
+ printf("Error to create shm...exit\n");
+ return -1;
+ }
+
+ for (i=0;i<num;i++) {
+ pid_t pid;
+
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
+ log_info(parameters[i].cpu, "Configurations:\n");
+ log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
+ parameters[i].cpu,
+ parameters[i].loop,
+ parameters[i].interval);
+ log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
+ parameters[i].err_type_info,
+ parameters[i].err_struct_info);
+
+ sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
+ err_type_info.err_type_info=parameters[i].err_type_info;
+ err_struct_info.err_struct_info=parameters[i].err_struct_info;
+ memcpy(err_data_buffer.err_data_buffer,
+ parameters[i].err_data_buffer,
+ ERR_DATA_BUFFER_SIZE*8);
+
+ pid=fork();
+ if (pid==0) {
+ unsigned long mask[MASK_SIZE];
+ int j, k;
+
+ void *va1, *va2;
+
+ /* Allocate two memory areas va1 and va2 in shm */
+ va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
+ va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
+
+ vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
+ memset(va1, 0x1, PAGE_SIZE);
+ memset(va2, 0x2, PAGE_SIZE);
+
+ if (empty_data_buffer(err_data_buffer.err_data_buffer))
+ /* If not specified yet, construct data buffer
+ * with va1
+ */
+ construct_data_buf(path, err_type_info,
+ err_struct_info, &err_data_buffer,va1);
+
+ for (j=0;j<MASK_SIZE;j++)
+ mask[j]=0;
+
+ cpu=parameters[i].cpu;
+ k = cpu%64;
+ j = cpu/64;
+ mask[j]=1<<k;
+
+ if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
+ perror("Error sched_setaffinity:");
+ return -1;
+ }
+
+ for (j=0; j<parameters[i].loop; j++) {
+ log_info(parameters[i].cpu,"Injection ");
+ log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
+
+ parameters[i].cpu,j+1, parameters[i].loop);
+
+ /* Hold the lock */
+ if (one_lock)
+ lock(0);
+ else
+ /* Hold lock on this cpu */
+ lock(parameters[i].cpu);
+
+ if ((status=err_inject(parameters[i].cpu,
+ path, err_type_info,
+ err_struct_info, err_data_buffer))
+ ==0) {
+ /* consume the error for "inject only"*/
+ memcpy(va2, va1, PAGE_SIZE);
+ memcpy(va1, va2, PAGE_SIZE);
+ log_info(parameters[i].cpu,
+ "successful\n");
+ }
+ else {
+ log_info(parameters[i].cpu,"fail:");
+ log_info(parameters[i].cpu,
+ "status=%d\n", status);
+ unlock(parameters[i].cpu);
+ break;
+ }
+ if (one_lock)
+ /* Release the lock */
+ unlock(0);
+ /* Release lock on this cpu */
+ else
+ unlock(parameters[i].cpu);
+
+ if (j < parameters[i].loop-1)
+ sleep(parameters[i].interval);
+ }
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
+ return 0;
+ }
+ else if (pid<0) {
+ perror("Error fork:");
+ continue;
+ }
+ child_pid[i]=pid;
+ }
+ for (i=0;i<num;i++)
+ waitpid(child_pid[i], NULL, 0);
+
+ if (one_lock)
+ free_sem(0);
+ else
+ for (i=0;i<num;i++)
+ free_sem(parameters[i].cpu);
+
+ printf("All done.\n");
+
+ return 0;
+}
+
+void help()
+{
+ printf("err_inject_tool:\n");
+ printf("\t-q: query all capabilities. default: off\n");
+ printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
+ printf("\t-i: inject errors. default: off\n");
+ printf("\t-l: one lock per cpu. default: one lock for all\n");
+ printf("\t-e: error parameters:\n");
+ printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
+ printf("\t\t cpu: logical cpu number the error will be inject in.\n");
+ printf("\t\t loop: times the error will be injected.\n");
+ printf("\t\t interval: In second. every so often one error is injected.\n");
+ printf("\t\t err_type_info, err_struct_info: PAL parameters.\n");
+ printf("\t\t err_data_buffer: PAL parameter. Optional. If not present,\n");
+ printf("\t\t it's constructed by tool automatically. Be\n");
+ printf("\t\t careful to provide err_data_buffer and make\n");
+ printf("\t\t sure it's working with the environment.\n");
+ printf("\t Note:no space between error parameters.\n");
+ printf("\t default: Take error parameters from err.conf instead of command line.\n");
+ printf("\t-v: verbose. default: off\n");
+ printf("\t-h: help\n\n");
+ printf("The tool will take err.conf file as ");
+ printf("input to inject single or multiple errors ");
+ printf("on one or multiple cpus in parallel.\n");
+}
+
+int main(int argc, char **argv)
+{
+ char c;
+ int do_err_inj=0;
+ int do_query_all=0;
+ int count;
+ u32 m;
+
+ /* Default one lock for all cpu's */
+ one_lock=1;
+ while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
+ switch (c) {
+ case 'm': /* Procedure mode. 1: phys 2: virt */
+ count=sscanf(optarg, "%x", &m);
+ if (count!=1 || (m!=1 && m!=2)) {
+ printf("Wrong mode number.\n");
+ help();
+ return -1;
+ }
+ mode=m;
+ break;
+ case 'i': /* Inject errors */
+ do_err_inj=1;
+ break;
+ case 'q': /* Query */
+ do_query_all=1;
+ break;
+ case 'v': /* Verbose */
+ verbose=1;
+ break;
+ case 'l': /* One lock per cpu */
+ one_lock=0;
+ break;
+ case 'e': /* error arguments */
+ /* Take parameters:
+ * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
+ * err_data_buffer is optional. Recommend not to specify
+ * err_data_buffer. Better to use tool to generate it.
+ */
+ count=sscanf(optarg,
+ "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info,
+ &line_para.err_data_buffer[0],
+ &line_para.err_data_buffer[1],
+ &line_para.err_data_buffer[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ line_para.err_data_buffer[0]=-1,
+ line_para.err_data_buffer[1]=-1,
+ line_para.err_data_buffer[2]=-1;
+ count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info);
+ if (count!=PARA_FIELD_NUM) {
+ printf("Wrong error arguments.\n");
+ help();
+ return -1;
+ }
+ }
+ para=1;
+ break;
+ continue;
+ break;
+ case 'h':
+ help();
+ return 0;
+ default:
+ break;
+ }
+
+ if (do_query_all)
+ query_all_capabilities();
+ if (do_err_inj)
+ err_inj();
+
+ if (!do_query_all && !do_err_inj)
+ help();
+
+ return 0;
+}
+
diff --git a/Documentation/ioctl-number.txt b/Documentation/ioctl-number.txt
index 8f750c0..3de7d37 100644
--- a/Documentation/ioctl-number.txt
+++ b/Documentation/ioctl-number.txt
@@ -138,7 +138,8 @@ Code Seq# Include File Comments
'm' 00-1F net/irda/irmod.h conflict!
'n' 00-7F linux/ncp_fs.h
'n' E0-FF video/matrox.h matroxfb
-'p' 00-3F linux/mc146818rtc.h
+'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
+'p' 00-3F linux/mc146818rtc.h conflict!
'p' 40-7F linux/nvram.h
'p' 80-9F user-space parport
<mailto:tim@cyberelk.net>
diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
index 769ee05..1d247d5 100644
--- a/Documentation/kbuild/modules.txt
+++ b/Documentation/kbuild/modules.txt
@@ -249,7 +249,7 @@ following files:
--> filename: Makefile
KERNELDIR := /lib/modules/`uname -r`/build
all::
- $(MAKE) -C $KERNELDIR M=`pwd` $@
+ $(MAKE) -C $(KERNELDIR) M=`pwd` $@
# Module specific targets
genbin:
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 38d7db3..6b8ad06 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -496,6 +496,30 @@ and is between 256 and 4096 characters. It is defined in the file
Format: <area>[,<node>]
See also Documentation/networking/decnet.txt.
+ default_blu= [VT]
+ Format: <blue0>,<blue1>,<blue2>,...,<blue15>
+ Change the default blue palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ default_grn= [VT]
+ Format: <green0>,<green1>,<green2>,...,<green15>
+ Change the default green palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ default_red= [VT]
+ Format: <red0>,<red1>,<red2>,...,<red15>
+ Change the default red palette of the console.
+ This is a 16-member array composed of values
+ ranging from 0-255.
+
+ default_utf8= [VT]
+ Format=<0|1>
+ Set system-wide default UTF-8 mode for all tty's.
+ Default is 0 and by setting to 1, it enables UTF-8
+ mode for all newly opened or allocated terminals.
+
dhash_entries= [KNL]
Set number of hash buckets for dentry cache.
@@ -816,6 +840,11 @@ and is between 256 and 4096 characters. It is defined in the file
lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
Format: addr:<io>,irq:<irq>
+ legacy_serial.force [HW,IA-32,X86-64]
+ Probe for COM ports at legacy addresses even
+ if PNPBIOS or ACPI should describe them. This
+ is for working around firmware defects.
+
llsc*= [IA64] See function print_params() in
arch/ia64/sn/kernel/llsc4.c.
@@ -1578,6 +1607,17 @@ and is between 256 and 4096 characters. It is defined in the file
smp-alt-once [IA-32,SMP] On a hotplug CPU system, only
attempt to substitute SMP alternatives once at boot.
+ smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
+ smsc-ircc2.ircc_cfg= [HW] Device configuration I/O port
+ smsc-ircc2.ircc_sir= [HW] SIR base I/O port
+ smsc-ircc2.ircc_fir= [HW] FIR base I/O port
+ smsc-ircc2.ircc_irq= [HW] IRQ line
+ smsc-ircc2.ircc_dma= [HW] DMA channel
+ smsc-ircc2.ircc_transceiver= [HW] Transceiver type:
+ 0: Toshiba Satellite 1800 (GP data pin select)
+ 1: Fast pin select (default)
+ 2: ATC IRMode
+
snd-ad1816a= [HW,ALSA]
snd-ad1848= [HW,ALSA]
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index d71faff..da5404a 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -14,6 +14,7 @@ CONTENTS
8. Kprobes Example
9. Jprobes Example
10. Kretprobes Example
+Appendix A: The kprobes debugfs interface
1. Concepts: Kprobes, Jprobes, Return Probes
@@ -349,9 +350,12 @@ for instrumentation and error reporting.)
If the number of times a function is called does not match the number
of times it returns, registering a return probe on that function may
-produce undesirable results. We have the do_exit() case covered.
-do_execve() and do_fork() are not an issue. We're unaware of other
-specific cases where this could be a problem.
+produce undesirable results. In such a case, a line:
+kretprobe BUG!: Processing kretprobe d000000000041aa8 @ c00000000004f48c
+gets printed. With this information, one will be able to correlate the
+exact instance of the kretprobe that caused the problem. We have the
+do_exit() case covered. do_execve() and do_fork() are not an issue.
+We're unaware of other specific cases where this could be a problem.
If, upon entry to or exit from a function, the CPU is running on
a stack other than that of the current task, registering a return
@@ -614,3 +618,27 @@ http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe
http://www.redhat.com/magazine/005mar05/features/kprobes/
http://www-users.cs.umn.edu/~boutcher/kprobes/
http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
+
+
+Appendix A: The kprobes debugfs interface
+
+With recent kernels (> 2.6.20) the list of registered kprobes is visible
+under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug).
+
+/debug/kprobes/list: Lists all registered probes on the system
+
+c015d71a k vfs_read+0x0
+c011a316 j do_fork+0x0
+c03dedc5 r tcp_v4_rcv+0x0
+
+The first column provides the kernel address where the probe is inserted.
+The second column identifies the type of probe (k - kprobe, r - kretprobe
+and j - jprobe), while the third column specifies the symbol+offset of
+the probe. If the probed function belongs to a module, the module name
+is also specified.
+
+/debug/kprobes/enabled: Turn kprobes ON/OFF
+
+Provides a knob to globally turn registered kprobes ON or OFF. By default,
+all kprobes are enabled. By echoing "0" to this file, all registered probes
+will be disarmed, till such time a "1" is echoed to this file.
diff --git a/Documentation/laptop-mode.txt b/Documentation/laptop-mode.txt
index 6f639e3..eeedee1 100644
--- a/Documentation/laptop-mode.txt
+++ b/Documentation/laptop-mode.txt
@@ -33,7 +33,7 @@ or anything. Simply install all the files included in this document, and
laptop mode will automatically be started when you're on battery. For
your convenience, a tarball containing an installer can be downloaded at:
-http://www.xs4all.nl/~bsamwel/laptop_mode/tools/
+http://www.samwel.tk/laptop_mode/laptop_mode/
To configure laptop mode, you need to edit the configuration file, which is
located in /etc/default/laptop-mode on Debian-based systems, or in
diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt
index ea55ea8..7d5b60d 100644
--- a/Documentation/oops-tracing.txt
+++ b/Documentation/oops-tracing.txt
@@ -234,9 +234,6 @@ characters, each representing a particular tainted value.
6: 'B' if a page-release function has found a bad page reference or
some unexpected page flags.
- 7: 'U' if a user specifically requested that the Tainted flag be set,
- ' ' otherwise.
-
7: 'U' if a user or user application specifically requested that the
Tainted flag be set, ' ' otherwise.
diff --git a/Documentation/pcmcia/driver.txt b/Documentation/pcmcia/driver.txt
new file mode 100644
index 0000000..0ac1679
--- /dev/null
+++ b/Documentation/pcmcia/driver.txt
@@ -0,0 +1,30 @@
+PCMCIA Driver
+-------------
+
+
+sysfs
+-----
+
+New PCMCIA IDs may be added to a device driver pcmcia_device_id table at
+runtime as shown below:
+
+echo "match_flags manf_id card_id func_id function device_no \
+prod_id_hash[0] prod_id_hash[1] prod_id_hash[2] prod_id_hash[3]" > \
+/sys/bus/pcmcia/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The meaning is described in the PCMCIA specification, the match_flags is
+a bitwise or-ed combination from PCMCIA_DEV_ID_MATCH_* constants
+defined in include/linux/mod_devicetable.h.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCMCIA device listed in its (newly updated) pcmcia_device_id list.
+
+A common use-case is to add a new device according to the manufacturer ID
+and the card ID (form the manf_id and card_id file in the device tree).
+For this, just use:
+
+echo "0x3 manf_id card_id 0 0 0 0 0 0 0" > \
+ /sys/bus/pcmcia/drivers/{driver}/new_id
+
+after loading the driver.
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
new file mode 100644
index 0000000..1a85e2b
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.txt
@@ -0,0 +1,106 @@
+Debugging suspend and resume
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing suspend to disk (STD)
+
+To verify that the STD works, you can try to suspend in the "reboot" mode:
+
+# echo reboot > /sys/power/disk
+# echo disk > /sys/power/state
+
+and the system should suspend, reboot, resume and get back to the command prompt
+where you have started the transition. If that happens, the STD is most likely
+to work correctly, but you need to repeat the test at least a couple of times in
+a row for confidence. This is necessary, because some problems only show up on
+a second attempt at suspending and resuming the system. You should also test
+the "platform" and "shutdown" modes of suspend:
+
+# echo platform > /sys/power/disk
+# echo disk > /sys/power/state
+
+or
+
+# echo shutdown > /sys/power/disk
+# echo disk > /sys/power/state
+
+in which cases you will have to press the power button to make the system
+resume. If that does not work, you will need to identify what goes wrong.
+
+a) Test mode of STD
+
+To verify if there are any drivers that cause problems you can run the STD
+in the test mode:
+
+# echo test > /sys/power/disk
+# echo disk > /sys/power/state
+
+in which case the system should freeze tasks, suspend devices, disable nonboot
+CPUs (if any), wait for 5 seconds, enable nonboot CPUs, resume devices, thaw
+tasks and return to your command prompt. If that fails, most likely there is
+a driver that fails to either suspend or resume (in the latter case the system
+may hang or be unstable after the test, so please take that into consideration).
+To find this driver, you can carry out a binary search according to the rules:
+- if the test fails, unload a half of the drivers currently loaded and repeat
+(that would probably involve rebooting the system, so always note what drivers
+have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before the STD transition. In that case
+please make sure to report the problem with the driver.
+
+It is also possible that a cycle can still fail after you have unloaded
+all modules. In that case, you would want to look in your kernel configuration
+for the drivers that can be compiled as modules (testing again with them as
+modules), and possibly also try boot time options such as "noapic" or "noacpi".
+
+b) Testing minimal configuration
+
+If the test mode of STD works, you can boot the system with "init=/bin/bash"
+and attempt to suspend in the "reboot", "shutdown" and "platform" modes. If
+that does not work, there probably is a problem with a driver statically
+compiled into the kernel and you can try to compile more drivers as modules,
+so that they can be tested individually. Otherwise, there is a problem with a
+modular driver and you can find it by loading a half of the modules you normally
+use and binary searching in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before the STD transition, and please report the problem with it(them).
+
+c) Advanced debugging
+
+In case the STD does not work on your system even in the minimal configuration
+and compiling more drivers as modules is not practical or some modules cannot
+be unloaded, you can use one of the more advanced debugging techniques to find
+the problem. First, if there is a serial port in your box, you can set the
+CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel
+messages using the serial console. This may provide you with some information
+about the reasons of the suspend (resume) failure. Alternatively, it may be
+possible to use a FireWire port for debugging with firescope
+(ftp://ftp.firstfloor.org/pub/ak/firescope/). On i386 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/s2ram.txt .
+
+2. Testing suspend to RAM (STR)
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/s2ram . However, before doing that it is recommended to
+carry out the procedure described in section 1.
+
+Assume you have resolved the problems with the STD and you have found some
+failing drivers. These drivers are also likely to fail during the STR or
+during the resume, so it is better to unload them every time before the STR
+transition. Now, you can follow the instructions at
+http://en.opensuse.org/s2ram to test the system, but if it does not work
+"out of the box", you may need to boot it with "init=/bin/bash" and test
+s2ram in the minimal configuration. In that case, you may be able to search
+for failing drivers by following the procedure analogous to the one described in
+1b). If you find some failing drivers, you will have to unload them every time
+before the STR transition (ie. before you run s2ram), and please report the
+problems with them.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
new file mode 100644
index 0000000..33016c2
--- /dev/null
+++ b/Documentation/power/drivers-testing.txt
@@ -0,0 +1,42 @@
+Testing suspend and resume support in device drivers
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded. Moreover, that should be done
+several times, preferably several times in a row, and separately for the suspend
+to disk (STD) and the suspend to RAM (STR) transitions, because each of these
+cases involves different ordering of operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested. Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver. Please see Documents/power/basic-pm-debugging.txt for
+more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the STD in the test mode (see:
+Documents/power/basic-pm-debugging.txt, 1a)).
+
+b) Load the driver and attempt to suspend to disk in the "reboot", "shutdown"
+and "platform" modes (see: Documents/power/basic-pm-debugging.txt, 1).
+
+c) Compile the driver directly into the kernel and try the STD in the test mode.
+
+d) Attempt to suspend to disk with the driver compiled directly into the kernel
+in the "reboot", "shutdown" and "platform" modes.
+
+e) Attempt to suspend to RAM using the s2ram tool with the driver loaded (see:
+Documents/power/basic-pm-debugging.txt, 2). As far as the STR tests are
+concerned, it should not matter whether or not the driver is built as a module.
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests. If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index 8c5b41b..fd5192a 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -34,8 +34,12 @@ for 5 seconds, resume devices, unfreeze tasks and enable nonboot CPUs. Then,
we are able to look in the log messages and work out, for example, which code
is being slow and which device drivers are misbehaving.
-Reading from this file will display what the mode is currently set
-to. Writing to this file will accept one of
+Reading from this file will display all supported modes and the currently
+selected one in brackets, for example
+
+ [shutdown] reboot test testproc
+
+Writing to this file will accept one of
'platform' (only if the platform supports it)
'shutdown'
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 033a3f3..d4bfae7 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1560,6 +1560,9 @@ platforms are moved over to use the flattened-device-tree model.
network device. This is used by the bootwrapper to interpret
MAC addresses passed by the firmware when no information other
than indices is available to associate an address with a device.
+ - phy-connection-type : a string naming the controller/PHY interface type,
+ i.e., "mii" (default), "rmii", "gmii", "rgmii", "rgmii-id", "tbi",
+ or "rtbi".
Example:
ucc@2000 {
@@ -1574,6 +1577,7 @@ platforms are moved over to use the flattened-device-tree model.
rx-clock = "none";
tx-clock = "clk9";
phy-handle = <212000>;
+ phy-connection-type = "gmii";
pio-handle = <140001>;
};
diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt
index 1ef6bb8..7c701b8 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/rtc.txt
@@ -147,7 +147,7 @@ RTC class framework, but can't be supported by the older driver.
* RTC_AIE_ON, RTC_AIE_OFF, RTC_ALM_SET, RTC_ALM_READ ... when the RTC
is connected to an IRQ line, it can often issue an alarm IRQ up to
- 24 hours in the future.
+ 24 hours in the future. (Use RTC_WKALM_* by preference.)
* RTC_WKALM_SET, RTC_WKALM_RD ... RTCs that can issue alarms beyond
the next 24 hours use a slightly more powerful API, which supports
@@ -175,10 +175,7 @@ driver returns ENOIOCTLCMD. Some common examples:
called with appropriate values.
* RTC_ALM_SET, RTC_ALM_READ, RTC_WKALM_SET, RTC_WKALM_RD: the
- set_alarm/read_alarm functions will be called. To differentiate
- between the ALM and WKALM, check the larger fields of the rtc_wkalrm
- struct (like tm_year). These will be set to -1 when using ALM and
- will be set to proper values when using WKALM.
+ set_alarm/read_alarm functions will be called.
* RTC_IRQP_SET, RTC_IRQP_READ: the irq_set_freq function will be called
to set the frequency while the framework will handle the read for you
diff --git a/Documentation/sh/clk.txt b/Documentation/sh/clk.txt
new file mode 100644
index 0000000..9aef710
--- /dev/null
+++ b/Documentation/sh/clk.txt
@@ -0,0 +1,32 @@
+Clock framework on SuperH architecture
+
+The framework on SH extends existing API by the function clk_set_rate_ex,
+which prototype is as follows:
+
+ clk_set_rate_ex (struct clk *clk, unsigned long rate, int algo_id)
+
+The algo_id parameter is used to specify algorithm used to recalculate clocks,
+adjanced to clock, specified as first argument. It is assumed that algo_id==0
+means no changes to adjanced clock
+
+Internally, the clk_set_rate_ex forwards request to clk->ops->set_rate method,
+if it is present in ops structure. The method should set the clock rate and adjust
+all needed clocks according to the passed algo_id.
+Exact values for algo_id are machine-dependend. For the sh7722, the following
+values are defined:
+
+ NO_CHANGE = 0,
+ IUS_N1_N1, /* I:U = N:1, U:Sh = N:1 */
+ IUS_322, /* I:U:Sh = 3:2:2 */
+ IUS_522, /* I:U:Sh = 5:2:2 */
+ IUS_N11, /* I:U:Sh = N:1:1 */
+ SB_N1, /* Sh:B = N:1 */
+ SB3_N1, /* Sh:B3 = N:1 */
+ SB3_32, /* Sh:B3 = 3:2 */
+ SB3_43, /* Sh:B3 = 4:3 */
+ SB3_54, /* Sh:B3 = 5:4 */
+ BP_N1, /* B:P = N:1 */
+ IP_N1 /* I:P = N:1 */
+
+Each of these constants means relation between clocks that can be set via the FRQCR
+register
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index f9717fe..215e3b8 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -62,7 +62,7 @@ static struct resource pxa_spi_nssp_resources[] = {
static struct pxa2xx_spi_master pxa_nssp_master_info = {
.ssp_type = PXA25x_NSSP, /* Type of SSP */
- .clock_enable = CKEN9_NSSP, /* NSSP Peripheral clock */
+ .clock_enable = CKEN_NSSP, /* NSSP Peripheral clock */
.num_chipselect = 1, /* Matches the number of chips attached to NSSP */
.enable_dma = 1, /* Enables NSSP DMA */
};
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index ecc7c9e..795fbb4 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -8,7 +8,7 @@ What is SPI?
The "Serial Peripheral Interface" (SPI) is a synchronous four wire serial
link used to connect microcontrollers to sensors, memory, and peripherals.
-The three signal wires hold a clock (SCLK, often on the order of 10 MHz),
+The three signal wires hold a clock (SCK, often on the order of 10 MHz),
and parallel data lines with "Master Out, Slave In" (MOSI) or "Master In,
Slave Out" (MISO) signals. (Other names are also used.) There are four
clocking modes through which data is exchanged; mode-0 and mode-3 are most
@@ -22,7 +22,7 @@ other signals, often including an interrupt to the master.
Unlike serial busses like USB or SMBUS, even low level protocols for
SPI slave functions are usually not interoperable between vendors
-(except for cases like SPI memory chips).
+(except for commodities like SPI memory chips).
- SPI may be used for request/response style device protocols, as with
touchscreen sensors and memory chips.
@@ -77,8 +77,9 @@ cards without needing a special purpose MMC/SD/SDIO controller.
How do these driver programming interfaces work?
------------------------------------------------
The <linux/spi/spi.h> header file includes kerneldoc, as does the
-main source code, and you should certainly read that. This is just
-an overview, so you get the big picture before the details.
+main source code, and you should certainly read that chapter of the
+kernel API document. This is just an overview, so you get the big
+picture before those details.
SPI requests always go into I/O queues. Requests for a given SPI device
are always executed in FIFO order, and complete asynchronously through
@@ -88,7 +89,7 @@ a command and then reading its response.
There are two types of SPI driver, here called:
- Controller drivers ... these are often built in to System-On-Chip
+ Controller drivers ... controllers may be built in to System-On-Chip
processors, and often support both Master and Slave roles.
These drivers touch hardware registers and may use DMA.
Or they can be PIO bitbangers, needing just GPIO pins.
@@ -108,18 +109,18 @@ those two types of driver. At this writing, Linux has no slave side
programming interface.
There is a minimal core of SPI programming interfaces, focussing on
-using driver model to connect controller and protocol drivers using
+using the driver model to connect controller and protocol drivers using
device tables provided by board specific initialization code. SPI
shows up in sysfs in several locations:
- /sys/devices/.../CTLR/spiB.C ... spi_device for on bus "B",
+ /sys/devices/.../CTLR/spiB.C ... spi_device on bus "B",
chipselect C, accessed through CTLR.
/sys/devices/.../CTLR/spiB.C/modalias ... identifies the driver
that should be used with this device (for hotplug/coldplug)
/sys/bus/spi/devices/spiB.C ... symlink to the physical
- spiB-C device
+ spiB.C device
/sys/bus/spi/drivers/D ... driver for one or more spi*.* devices
@@ -240,7 +241,7 @@ The board_info should provide enough information to let the system work
without the chip's driver being loaded. The most troublesome aspect of
that is likely the SPI_CS_HIGH bit in the spi_device.mode field, since
sharing a bus with a device that interprets chipselect "backwards" is
-not possible.
+not possible until the infrastructure knows how to deselect it.
Then your board initialization code would register that table with the SPI
infrastructure, so that it's available later when the SPI master controller
@@ -268,16 +269,14 @@ board info based on the board that was hotplugged. Of course, you'd later
call at least spi_unregister_device() when that board is removed.
When Linux includes support for MMC/SD/SDIO/DataFlash cards through SPI, those
-configurations will also be dynamic. Fortunately, those devices all support
-basic device identification probes, so that support should hotplug normally.
+configurations will also be dynamic. Fortunately, such devices all support
+basic device identification probes, so they should hotplug normally.
How do I write an "SPI Protocol Driver"?
----------------------------------------
-All SPI drivers are currently kernel drivers. A userspace driver API
-would just be another kernel driver, probably offering some lowlevel
-access through aio_read(), aio_write(), and ioctl() calls and using the
-standard userspace sysfs mechanisms to bind to a given SPI device.
+Most SPI drivers are currently kernel drivers, but there's also support
+for userspace drivers. Here we talk only about kernel drivers.
SPI protocol drivers somewhat resemble platform device drivers:
@@ -319,7 +318,8 @@ might look like this unless you're creating a class_device:
As soon as it enters probe(), the driver may issue I/O requests to
the SPI device using "struct spi_message". When remove() returns,
-the driver guarantees that it won't submit any more such messages.
+or after probe() fails, the driver guarantees that it won't submit
+any more such messages.
- An spi_message is a sequence of protocol operations, executed
as one atomic sequence. SPI driver controls include:
@@ -368,7 +368,8 @@ the driver guarantees that it won't submit any more such messages.
Some drivers may need to modify spi_device characteristics like the
transfer mode, wordsize, or clock rate. This is done with spi_setup(),
which would normally be called from probe() before the first I/O is
-done to the device.
+done to the device. However, that can also be called at any time
+that no message is pending for that device.
While "spi_device" would be the bottom boundary of the driver, the
upper boundaries might include sysfs (especially for sensor readings),
@@ -445,11 +446,15 @@ SPI MASTER METHODS
This sets up the device clock rate, SPI mode, and word sizes.
Drivers may change the defaults provided by board_info, and then
call spi_setup(spi) to invoke this routine. It may sleep.
+ Unless each SPI slave has its own configuration registers, don't
+ change them right away ... otherwise drivers could corrupt I/O
+ that's in progress for other SPI devices.
master->transfer(struct spi_device *spi, struct spi_message *message)
This must not sleep. Its responsibility is arrange that the
- transfer happens and its complete() callback is issued; the two
- will normally happen later, after other transfers complete.
+ transfer happens and its complete() callback is issued. The two
+ will normally happen later, after other transfers complete, and
+ if the controller is idle it will need to be kickstarted.
master->cleanup(struct spi_device *spi)
Your controller driver may use spi_device.controller_state to hold
diff --git a/Documentation/spi/spidev b/Documentation/spi/spidev
new file mode 100644
index 0000000..5c8e1b98
--- /dev/null
+++ b/Documentation/spi/spidev
@@ -0,0 +1,307 @@
+SPI devices have a limited userspace API, supporting basic half-duplex
+read() and write() access to SPI slave devices. Using ioctl() requests,
+full duplex transfers and device I/O configuration are also available.
+
+ #include <fcntl.h>
+ #include <unistd.h>
+ #include <sys/ioctl.h>
+ #include <linux/types.h>
+ #include <linux/spi/spidev.h>
+
+Some reasons you might want to use this programming interface include:
+
+ * Prototyping in an environment that's not crash-prone; stray pointers
+ in userspace won't normally bring down any Linux system.
+
+ * Developing simple protocols used to talk to microcontrollers acting
+ as SPI slaves, which you may need to change quite often.
+
+Of course there are drivers that can never be written in userspace, because
+they need to access kernel interfaces (such as IRQ handlers or other layers
+of the driver stack) that are not accessible to userspace.
+
+
+DEVICE CREATION, DRIVER BINDING
+===============================
+The simplest way to arrange to use this driver is to just list it in the
+spi_board_info for a device as the driver it should use: the "modalias"
+entry is "spidev", matching the name of the driver exposing this API.
+Set up the other device characteristics (bits per word, SPI clocking,
+chipselect polarity, etc) as usual, so you won't always need to override
+them later.
+
+(Sysfs also supports userspace driven binding/unbinding of drivers to
+devices. That mechanism might be supported here in the future.)
+
+When you do that, the sysfs node for the SPI device will include a child
+device node with a "dev" attribute that will be understood by udev or mdev.
+(Larger systems will have "udev". Smaller ones may configure "mdev" into
+busybox; it's less featureful, but often enough.) For a SPI device with
+chipselect C on bus B, you should see:
+
+ /dev/spidevB.C ... character special device, major number 153 with
+ a dynamically chosen minor device number. This is the node
+ that userspace programs will open, created by "udev" or "mdev".
+
+ /sys/devices/.../spiB.C ... as usual, the SPI device node will
+ be a child of its SPI master controller.
+
+ /sys/class/spidev/spidevB.C ... created when the "spidev" driver
+ binds to that device. (Directory or symlink, based on whether
+ or not you enabled the "deprecated sysfs files" Kconfig option.)
+
+Do not try to manage the /dev character device special file nodes by hand.
+That's error prone, and you'd need to pay careful attention to system
+security issues; udev/mdev should already be configured securely.
+
+If you unbind the "spidev" driver from that device, those two "spidev" nodes
+(in sysfs and in /dev) should automatically be removed (respectively by the
+kernel and by udev/mdev). You can unbind by removing the "spidev" driver
+module, which will affect all devices using this driver. You can also unbind
+by having kernel code remove the SPI device, probably by removing the driver
+for its SPI controller (so its spi_master vanishes).
+
+Since this is a standard Linux device driver -- even though it just happens
+to expose a low level API to userspace -- it can be associated with any number
+of devices at a time. Just provide one spi_board_info record for each such
+SPI device, and you'll get a /dev device node for each device.
+
+
+BASIC CHARACTER DEVICE API
+==========================
+Normal open() and close() operations on /dev/spidevB.D files work as you
+would expect.
+
+Standard read() and write() operations are obviously only half-duplex, and
+the chipselect is deactivated between those operations. Full-duplex access,
+and composite operation without chipselect de-activation, is available using
+the SPI_IOC_MESSAGE(N) request.
+
+Several ioctl() requests let your driver read or override the device's current
+settings for data transfer parameters:
+
+ SPI_IOC_RD_MODE, SPI_IOC_WR_MODE ... pass a pointer to a byte which will
+ return (RD) or assign (WR) the SPI transfer mode. Use the constants
+ SPI_MODE_0..SPI_MODE_3; or if you prefer you can combine SPI_CPOL
+ (clock polarity, idle high iff this is set) or SPI_CPHA (clock phase,
+ sample on trailing edge iff this is set) flags.
+
+ SPI_IOC_RD_LSB_FIRST, SPI_IOC_WR_LSB_FIRST ... pass a pointer to a byte
+ which will return (RD) or assign (WR) the bit justification used to
+ transfer SPI words. Zero indicates MSB-first; other values indicate
+ the less common LSB-first encoding. In both cases the specified value
+ is right-justified in each word, so that unused (TX) or undefined (RX)
+ bits are in the MSBs.
+
+ SPI_IOC_RD_BITS_PER_WORD, SPI_IOC_WR_BITS_PER_WORD ... pass a pointer to
+ a byte which will return (RD) or assign (WR) the number of bits in
+ each SPI transfer word. The value zero signifies eight bits.
+
+ SPI_IOC_RD_MAX_SPEED_HZ, SPI_IOC_WR_MAX_SPEED_HZ ... pass a pointer to a
+ u32 which will return (RD) or assign (WR) the maximum SPI transfer
+ speed, in Hz. The controller can't necessarily assign that specific
+ clock speed.
+
+NOTES:
+
+ - At this time there is no async I/O support; everything is purely
+ synchronous.
+
+ - There's currently no way to report the actual bit rate used to
+ shift data to/from a given device.
+
+ - From userspace, you can't currently change the chip select polarity;
+ that could corrupt transfers to other devices sharing the SPI bus.
+ Each SPI device is deselected when it's not in active use, allowing
+ other drivers to talk to other devices.
+
+ - There's a limit on the number of bytes each I/O request can transfer
+ to the SPI device. It defaults to one page, but that can be changed
+ using a module parameter.
+
+ - Because SPI has no low-level transfer acknowledgement, you usually
+ won't see any I/O errors when talking to a non-existent device.
+
+
+FULL DUPLEX CHARACTER DEVICE API
+================================
+
+See the sample program below for one example showing the use of the full
+duplex programming interface. (Although it doesn't perform a full duplex
+transfer.) The model is the same as that used in the kernel spi_sync()
+request; the individual transfers offer the same capabilities as are
+available to kernel drivers (except that it's not asynchronous).
+
+The example shows one half-duplex RPC-style request and response message.
+These requests commonly require that the chip not be deselected between
+the request and response. Several such requests could be chained into
+a single kernel request, even allowing the chip to be deselected after
+each response. (Other protocol options include changing the word size
+and bitrate for each transfer segment.)
+
+To make a full duplex request, provide both rx_buf and tx_buf for the
+same transfer. It's even OK if those are the same buffer.
+
+
+SAMPLE PROGRAM
+==============
+
+-------------------------------- CUT HERE
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/types.h>
+#include <linux/spi/spidev.h>
+
+
+static int verbose;
+
+static void do_read(int fd, int len)
+{
+ unsigned char buf[32], *bp;
+ int status;
+
+ /* read at least 2 bytes, no more than 32 */
+ if (len < 2)
+ len = 2;
+ else if (len > sizeof(buf))
+ len = sizeof(buf);
+ memset(buf, 0, sizeof buf);
+
+ status = read(fd, buf, len);
+ if (status < 0) {
+ perror("read");
+ return;
+ }
+ if (status != len) {
+ fprintf(stderr, "short read\n");
+ return;
+ }
+
+ printf("read(%2d, %2d): %02x %02x,", len, status,
+ buf[0], buf[1]);
+ status -= 2;
+ bp = buf + 2;
+ while (status-- > 0)
+ printf(" %02x", *bp++);
+ printf("\n");
+}
+
+static void do_msg(int fd, int len)
+{
+ struct spi_ioc_transfer xfer[2];
+ unsigned char buf[32], *bp;
+ int status;
+
+ memset(xfer, 0, sizeof xfer);
+ memset(buf, 0, sizeof buf);
+
+ if (len > sizeof buf)
+ len = sizeof buf;
+
+ buf[0] = 0xaa;
+ xfer[0].tx_buf = (__u64) buf;
+ xfer[0].len = 1;
+
+ xfer[1].rx_buf = (__u64) buf;
+ xfer[1].len = len;
+
+ status = ioctl(fd, SPI_IOC_MESSAGE(2), xfer);
+ if (status < 0) {
+ perror("SPI_IOC_MESSAGE");
+ return;
+ }
+
+ printf("response(%2d, %2d): ", len, status);
+ for (bp = buf; len; len--)
+ printf(" %02x", *bp++);
+ printf("\n");
+}
+
+static void dumpstat(const char *name, int fd)
+{
+ __u8 mode, lsb, bits;
+ __u32 speed;
+
+ if (ioctl(fd, SPI_IOC_RD_MODE, &mode) < 0) {
+ perror("SPI rd_mode");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_LSB_FIRST, &lsb) < 0) {
+ perror("SPI rd_lsb_fist");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits) < 0) {
+ perror("SPI bits_per_word");
+ return;
+ }
+ if (ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed) < 0) {
+ perror("SPI max_speed_hz");
+ return;
+ }
+
+ printf("%s: spi mode %d, %d bits %sper word, %d Hz max\n",
+ name, mode, bits, lsb ? "(lsb first) " : "", speed);
+}
+
+int main(int argc, char **argv)
+{
+ int c;
+ int readcount = 0;
+ int msglen = 0;
+ int fd;
+ const char *name;
+
+ while ((c = getopt(argc, argv, "hm:r:v")) != EOF) {
+ switch (c) {
+ case 'm':
+ msglen = atoi(optarg);
+ if (msglen < 0)
+ goto usage;
+ continue;
+ case 'r':
+ readcount = atoi(optarg);
+ if (readcount < 0)
+ goto usage;
+ continue;
+ case 'v':
+ verbose++;
+ continue;
+ case 'h':
+ case '?':
+usage:
+ fprintf(stderr,
+ "usage: %s [-h] [-m N] [-r N] /dev/spidevB.D\n",
+ argv[0]);
+ return 1;
+ }
+ }
+
+ if ((optind + 1) != argc)
+ goto usage;
+ name = argv[optind];
+
+ fd = open(name, O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ return 1;
+ }
+
+ dumpstat(name, fd);
+
+ if (msglen)
+ do_msg(fd, msglen);
+
+ if (readcount)
+ do_read(fd, readcount);
+
+ close(fd);
+ return 0;
+}
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e96a341..1d19256 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -197,11 +197,22 @@ and may not be fast.
panic_on_oom
-This enables or disables panic on out-of-memory feature. If this is set to 1,
-the kernel panics when out-of-memory happens. If this is set to 0, the kernel
-will kill some rogue process, called oom_killer. Usually, oom_killer can kill
-rogue processes and system will survive. If you want to panic the system
-rather than killing rogue processes, set this to 1.
+This enables or disables panic on out-of-memory feature.
-The default value is 0.
+If this is set to 0, the kernel will kill some rogue process,
+called oom_killer. Usually, oom_killer can kill rogue processes and
+system will survive.
+
+If this is set to 1, the kernel panics when out-of-memory happens.
+However, if a process limits using nodes by mempolicy/cpusets,
+and those nodes become memory exhaustion status, one process
+may be killed by oom-killer. No panic occurs in this case.
+Because other nodes' memory may be free. This means system total status
+may be not fatal yet.
+If this is set to 2, the kernel panics compulsorily even on the
+above-mentioned.
+
+The default value is 0.
+1 and 2 are for failover of clustering. Please select either
+according to your policy of failover.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index d43aa9d..ba328f2 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -1,6 +1,6 @@
Linux Magic System Request Key Hacks
Documentation for sysrq.c
-Last update: 2007-JAN-06
+Last update: 2007-MAR-14
* What is the magic SysRq key?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -75,7 +75,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'f' - Will call oom_kill to kill a memory hog process.
-'g' - Used by kgdb on ppc platforms.
+'g' - Used by kgdb on ppc and sh platforms.
'h' - Will display help (actually any other key than those listed
above will display help. but 'h' is easy to remember :-)
diff --git a/Documentation/tty.txt b/Documentation/tty.txt
index 5f799e6..048a876 100644
--- a/Documentation/tty.txt
+++ b/Documentation/tty.txt
@@ -108,7 +108,9 @@ hardware driver through the function pointers within the tty->driver
structure:
write() Write a block of characters to the tty device.
- Returns the number of characters accepted.
+ Returns the number of characters accepted. The
+ character buffer passed to this method is already
+ in kernel space.
put_char() Queues a character for writing to the tty device.
If there is no room in the queue, the character is
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
new file mode 100644
index 0000000..41710cc
--- /dev/null
+++ b/Documentation/vm/slabinfo.c
@@ -0,0 +1,943 @@
+/*
+ * Slabinfo: Tool to get reports about slabs
+ *
+ * (C) 2007 sgi, Christoph Lameter <clameter@sgi.com>
+ *
+ * Compile by:
+ *
+ * gcc -o slabinfo slabinfo.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <getopt.h>
+#include <regex.h>
+
+#define MAX_SLABS 500
+#define MAX_ALIASES 500
+#define MAX_NODES 1024
+
+struct slabinfo {
+ char *name;
+ int alias;
+ int refs;
+ int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+ int hwcache_align, object_size, objs_per_slab;
+ int sanity_checks, slab_size, store_user, trace;
+ int order, poison, reclaim_account, red_zone;
+ unsigned long partial, objects, slabs;
+ int numa[MAX_NODES];
+ int numa_partial[MAX_NODES];
+} slabinfo[MAX_SLABS];
+
+struct aliasinfo {
+ char *name;
+ char *ref;
+ struct slabinfo *slab;
+} aliasinfo[MAX_ALIASES];
+
+int slabs = 0;
+int aliases = 0;
+int alias_targets = 0;
+int highest_node = 0;
+
+char buffer[4096];
+
+int show_alias = 0;
+int show_slab = 0;
+int skip_zero = 1;
+int show_numa = 0;
+int show_track = 0;
+int show_first_alias = 0;
+int validate = 0;
+int shrink = 0;
+int show_inverted = 0;
+int show_single_ref = 0;
+int show_totals = 0;
+int sort_size = 0;
+
+int page_size;
+
+regex_t pattern;
+
+void fatal(const char *x, ...)
+{
+ va_list ap;
+
+ va_start(ap, x);
+ vfprintf(stderr, x, ap);
+ va_end(ap);
+ exit(1);
+}
+
+void usage(void)
+{
+ printf("slabinfo [-ahnpvtsz] [slab-regexp]\n"
+ "-a|--aliases Show aliases\n"
+ "-h|--help Show usage information\n"
+ "-n|--numa Show NUMA information\n"
+ "-s|--shrink Shrink slabs\n"
+ "-v|--validate Validate slabs\n"
+ "-t|--tracking Show alloc/free information\n"
+ "-T|--Totals Show summary information\n"
+ "-l|--slabs Show slabs\n"
+ "-S|--Size Sort by size\n"
+ "-z|--zero Include empty slabs\n"
+ "-f|--first-alias Show first alias\n"
+ "-i|--inverted Inverted list\n"
+ "-1|--1ref Single reference\n"
+ );
+}
+
+unsigned long read_obj(char *name)
+{
+ FILE *f = fopen(name, "r");
+
+ if (!f)
+ buffer[0] = 0;
+ else {
+ if (!fgets(buffer,sizeof(buffer), f))
+ buffer[0] = 0;
+ fclose(f);
+ if (buffer[strlen(buffer)] == '\n')
+ buffer[strlen(buffer)] = 0;
+ }
+ return strlen(buffer);
+}
+
+
+/*
+ * Get the contents of an attribute
+ */
+unsigned long get_obj(char *name)
+{
+ if (!read_obj(name))
+ return 0;
+
+ return atol(buffer);
+}
+
+unsigned long get_obj_and_str(char *name, char **x)
+{
+ unsigned long result = 0;
+ char *p;
+
+ *x = NULL;
+
+ if (!read_obj(name)) {
+ x = NULL;
+ return 0;
+ }
+ result = strtoul(buffer, &p, 10);
+ while (*p == ' ')
+ p++;
+ if (*p)
+ *x = strdup(p);
+ return result;
+}
+
+void set_obj(struct slabinfo *s, char *name, int n)
+{
+ char x[100];
+
+ sprintf(x, "%s/%s", s->name, name);
+
+ FILE *f = fopen(x, "w");
+
+ if (!f)
+ fatal("Cannot write to %s\n", x);
+
+ fprintf(f, "%d\n", n);
+ fclose(f);
+}
+
+/*
+ * Put a size string together
+ */
+int store_size(char *buffer, unsigned long value)
+{
+ unsigned long divisor = 1;
+ char trailer = 0;
+ int n;
+
+ if (value > 1000000000UL) {
+ divisor = 100000000UL;
+ trailer = 'G';
+ } else if (value > 1000000UL) {
+ divisor = 100000UL;
+ trailer = 'M';
+ } else if (value > 1000UL) {
+ divisor = 100;
+ trailer = 'K';
+ }
+
+ value /= divisor;
+ n = sprintf(buffer, "%ld",value);
+ if (trailer) {
+ buffer[n] = trailer;
+ n++;
+ buffer[n] = 0;
+ }
+ if (divisor != 1) {
+ memmove(buffer + n - 2, buffer + n - 3, 4);
+ buffer[n-2] = '.';
+ n++;
+ }
+ return n;
+}
+
+void decode_numa_list(int *numa, char *t)
+{
+ int node;
+ int nr;
+
+ memset(numa, 0, MAX_NODES * sizeof(int));
+
+ while (*t == 'N') {
+ t++;
+ node = strtoul(t, &t, 10);
+ if (*t == '=') {
+ t++;
+ nr = strtoul(t, &t, 10);
+ numa[node] = nr;
+ if (node > highest_node)
+ highest_node = node;
+ }
+ while (*t == ' ')
+ t++;
+ }
+}
+
+void slab_validate(struct slabinfo *s)
+{
+ set_obj(s, "validate", 1);
+}
+
+void slab_shrink(struct slabinfo *s)
+{
+ set_obj(s, "shrink", 1);
+}
+
+int line = 0;
+
+void first_line(void)
+{
+ printf("Name Objects Objsize Space "
+ "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+}
+
+/*
+ * Find the shortest alias of a slab
+ */
+struct aliasinfo *find_one_alias(struct slabinfo *find)
+{
+ struct aliasinfo *a;
+ struct aliasinfo *best = NULL;
+
+ for(a = aliasinfo;a < aliasinfo + aliases; a++) {
+ if (a->slab == find &&
+ (!best || strlen(best->name) < strlen(a->name))) {
+ best = a;
+ if (strncmp(a->name,"kmall", 5) == 0)
+ return best;
+ }
+ }
+ if (best)
+ return best;
+ fatal("Cannot find alias for %s\n", find->name);
+ return NULL;
+}
+
+unsigned long slab_size(struct slabinfo *s)
+{
+ return s->slabs * (page_size << s->order);
+}
+
+
+void slabcache(struct slabinfo *s)
+{
+ char size_str[20];
+ char dist_str[40];
+ char flags[20];
+ char *p = flags;
+
+ if (skip_zero && !s->slabs)
+ return;
+
+ store_size(size_str, slab_size(s));
+ sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
+
+ if (!line++)
+ first_line();
+
+ if (s->aliases)
+ *p++ = '*';
+ if (s->cache_dma)
+ *p++ = 'd';
+ if (s->hwcache_align)
+ *p++ = 'A';
+ if (s->poison)
+ *p++ = 'P';
+ if (s->reclaim_account)
+ *p++ = 'a';
+ if (s->red_zone)
+ *p++ = 'Z';
+ if (s->sanity_checks)
+ *p++ = 'F';
+ if (s->store_user)
+ *p++ = 'U';
+ if (s->trace)
+ *p++ = 'T';
+
+ *p = 0;
+ printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
+ s->name, s->objects, s->object_size, size_str, dist_str,
+ s->objs_per_slab, s->order,
+ s->slabs ? (s->partial * 100) / s->slabs : 100,
+ s->slabs ? (s->objects * s->object_size * 100) /
+ (s->slabs * (page_size << s->order)) : 100,
+ flags);
+}
+
+void slab_numa(struct slabinfo *s)
+{
+ int node;
+
+ if (!highest_node)
+ fatal("No NUMA information available.\n");
+
+ if (skip_zero && !s->slabs)
+ return;
+
+ if (!line) {
+ printf("\nSlab Node ");
+ for(node = 0; node <= highest_node; node++)
+ printf(" %4d", node);
+ printf("\n----------------------");
+ for(node = 0; node <= highest_node; node++)
+ printf("-----");
+ printf("\n");
+ }
+ printf("%-21s ", s->name);
+ for(node = 0; node <= highest_node; node++) {
+ char b[20];
+
+ store_size(b, s->numa[node]);
+ printf(" %4s", b);
+ }
+ printf("\n");
+ line++;
+}
+
+void show_tracking(struct slabinfo *s)
+{
+ printf("\n%s: Calls to allocate a slab object\n", s->name);
+ printf("---------------------------------------------------\n");
+ if (read_obj("alloc_calls"))
+ printf(buffer);
+
+ printf("%s: Calls to free a slab object\n", s->name);
+ printf("-----------------------------------------------\n");
+ if (read_obj("free_calls"))
+ printf(buffer);
+
+}
+
+void totals(void)
+{
+ struct slabinfo *s;
+
+ int used_slabs = 0;
+ char b1[20], b2[20], b3[20], b4[20];
+ unsigned long long max = 1ULL << 63;
+
+ /* Object size */
+ unsigned long long min_objsize = max, max_objsize = 0, avg_objsize;
+
+ /* Number of partial slabs in a slabcache */
+ unsigned long long min_partial = max, max_partial = 0,
+ avg_partial, total_partial = 0;
+
+ /* Number of slabs in a slab cache */
+ unsigned long long min_slabs = max, max_slabs = 0,
+ avg_slabs, total_slabs = 0;
+
+ /* Size of the whole slab */
+ unsigned long long min_size = max, max_size = 0,
+ avg_size, total_size = 0;
+
+ /* Bytes used for object storage in a slab */
+ unsigned long long min_used = max, max_used = 0,
+ avg_used, total_used = 0;
+
+ /* Waste: Bytes used for alignment and padding */
+ unsigned long long min_waste = max, max_waste = 0,
+ avg_waste, total_waste = 0;
+ /* Number of objects in a slab */
+ unsigned long long min_objects = max, max_objects = 0,
+ avg_objects, total_objects = 0;
+ /* Waste per object */
+ unsigned long long min_objwaste = max,
+ max_objwaste = 0, avg_objwaste,
+ total_objwaste = 0;
+
+ /* Memory per object */
+ unsigned long long min_memobj = max,
+ max_memobj = 0, avg_memobj,
+ total_objsize = 0;
+
+ /* Percentage of partial slabs per slab */
+ unsigned long min_ppart = 100, max_ppart = 0,
+ avg_ppart, total_ppart = 0;
+
+ /* Number of objects in partial slabs */
+ unsigned long min_partobj = max, max_partobj = 0,
+ avg_partobj, total_partobj = 0;
+
+ /* Percentage of partial objects of all objects in a slab */
+ unsigned long min_ppartobj = 100, max_ppartobj = 0,
+ avg_ppartobj, total_ppartobj = 0;
+
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ unsigned long long size;
+ unsigned long used;
+ unsigned long long wasted;
+ unsigned long long objwaste;
+ long long objects_in_partial_slabs;
+ unsigned long percentage_partial_slabs;
+ unsigned long percentage_partial_objs;
+
+ if (!s->slabs || !s->objects)
+ continue;
+
+ used_slabs++;
+
+ size = slab_size(s);
+ used = s->objects * s->object_size;
+ wasted = size - used;
+ objwaste = s->slab_size - s->object_size;
+
+ objects_in_partial_slabs = s->objects -
+ (s->slabs - s->partial - s ->cpu_slabs) *
+ s->objs_per_slab;
+
+ if (objects_in_partial_slabs < 0)
+ objects_in_partial_slabs = 0;
+
+ percentage_partial_slabs = s->partial * 100 / s->slabs;
+ if (percentage_partial_slabs > 100)
+ percentage_partial_slabs = 100;
+
+ percentage_partial_objs = objects_in_partial_slabs * 100
+ / s->objects;
+
+ if (percentage_partial_objs > 100)
+ percentage_partial_objs = 100;
+
+ if (s->object_size < min_objsize)
+ min_objsize = s->object_size;
+ if (s->partial < min_partial)
+ min_partial = s->partial;
+ if (s->slabs < min_slabs)
+ min_slabs = s->slabs;
+ if (size < min_size)
+ min_size = size;
+ if (wasted < min_waste)
+ min_waste = wasted;
+ if (objwaste < min_objwaste)
+ min_objwaste = objwaste;
+ if (s->objects < min_objects)
+ min_objects = s->objects;
+ if (used < min_used)
+ min_used = used;
+ if (objects_in_partial_slabs < min_partobj)
+ min_partobj = objects_in_partial_slabs;
+ if (percentage_partial_slabs < min_ppart)
+ min_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs < min_ppartobj)
+ min_ppartobj = percentage_partial_objs;
+ if (s->slab_size < min_memobj)
+ min_memobj = s->slab_size;
+
+ if (s->object_size > max_objsize)
+ max_objsize = s->object_size;
+ if (s->partial > max_partial)
+ max_partial = s->partial;
+ if (s->slabs > max_slabs)
+ max_slabs = s->slabs;
+ if (size > max_size)
+ max_size = size;
+ if (wasted > max_waste)
+ max_waste = wasted;
+ if (objwaste > max_objwaste)
+ max_objwaste = objwaste;
+ if (s->objects > max_objects)
+ max_objects = s->objects;
+ if (used > max_used)
+ max_used = used;
+ if (objects_in_partial_slabs > max_partobj)
+ max_partobj = objects_in_partial_slabs;
+ if (percentage_partial_slabs > max_ppart)
+ max_ppart = percentage_partial_slabs;
+ if (percentage_partial_objs > max_ppartobj)
+ max_ppartobj = percentage_partial_objs;
+ if (s->slab_size > max_memobj)
+ max_memobj = s->slab_size;
+
+ total_partial += s->partial;
+ total_slabs += s->slabs;
+ total_size += size;
+ total_waste += wasted;
+
+ total_objects += s->objects;
+ total_used += used;
+ total_partobj += objects_in_partial_slabs;
+ total_ppart += percentage_partial_slabs;
+ total_ppartobj += percentage_partial_objs;
+
+ total_objwaste += s->objects * objwaste;
+ total_objsize += s->objects * s->slab_size;
+ }
+
+ if (!total_objects) {
+ printf("No objects\n");
+ return;
+ }
+ if (!used_slabs) {
+ printf("No slabs\n");
+ return;
+ }
+
+ /* Per slab averages */
+ avg_partial = total_partial / used_slabs;
+ avg_slabs = total_slabs / used_slabs;
+ avg_size = total_size / used_slabs;
+ avg_waste = total_waste / used_slabs;
+
+ avg_objects = total_objects / used_slabs;
+ avg_used = total_used / used_slabs;
+ avg_partobj = total_partobj / used_slabs;
+ avg_ppart = total_ppart / used_slabs;
+ avg_ppartobj = total_ppartobj / used_slabs;
+
+ /* Per object object sizes */
+ avg_objsize = total_used / total_objects;
+ avg_objwaste = total_objwaste / total_objects;
+ avg_partobj = total_partobj * 100 / total_objects;
+ avg_memobj = total_objsize / total_objects;
+
+ printf("Slabcache Totals\n");
+ printf("----------------\n");
+ printf("Slabcaches : %3d Aliases : %3d->%-3d Active: %3d\n",
+ slabs, aliases, alias_targets, used_slabs);
+
+ store_size(b1, total_size);store_size(b2, total_waste);
+ store_size(b3, total_waste * 100 / total_used);
+ printf("Memory used: %6s # Loss : %6s MRatio: %6s%%\n", b1, b2, b3);
+
+ store_size(b1, total_objects);store_size(b2, total_partobj);
+ store_size(b3, total_partobj * 100 / total_objects);
+ printf("# Objects : %6s # PartObj: %6s ORatio: %6s%%\n", b1, b2, b3);
+
+ printf("\n");
+ printf("Per Cache Average Min Max Total\n");
+ printf("---------------------------------------------------------\n");
+
+ store_size(b1, avg_objects);store_size(b2, min_objects);
+ store_size(b3, max_objects);store_size(b4, total_objects);
+ printf("#Objects %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_slabs);store_size(b2, min_slabs);
+ store_size(b3, max_slabs);store_size(b4, total_slabs);
+ printf("#Slabs %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partial);store_size(b2, min_partial);
+ store_size(b3, max_partial);store_size(b4, total_partial);
+ printf("#PartSlab %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+ store_size(b1, avg_ppart);store_size(b2, min_ppart);
+ store_size(b3, max_ppart);
+ store_size(b4, total_partial * 100 / total_slabs);
+ printf("%%PartSlab %10s%% %10s%% %10s%% %10s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_partobj);store_size(b2, min_partobj);
+ store_size(b3, max_partobj);
+ store_size(b4, total_partobj);
+ printf("PartObjs %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_ppartobj);store_size(b2, min_ppartobj);
+ store_size(b3, max_ppartobj);
+ store_size(b4, total_partobj * 100 / total_objects);
+ printf("%% PartObj %10s%% %10s%% %10s%% %10s%%\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_size);store_size(b2, min_size);
+ store_size(b3, max_size);store_size(b4, total_size);
+ printf("Memory %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_used);store_size(b2, min_used);
+ store_size(b3, max_used);store_size(b4, total_used);
+ printf("Used %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ store_size(b1, avg_waste);store_size(b2, min_waste);
+ store_size(b3, max_waste);store_size(b4, total_waste);
+ printf("Loss %10s %10s %10s %10s\n",
+ b1, b2, b3, b4);
+
+ printf("\n");
+ printf("Per Object Average Min Max\n");
+ printf("---------------------------------------------\n");
+
+ store_size(b1, avg_memobj);store_size(b2, min_memobj);
+ store_size(b3, max_memobj);
+ printf("Memory %10s %10s %10s\n",
+ b1, b2, b3);
+ store_size(b1, avg_objsize);store_size(b2, min_objsize);
+ store_size(b3, max_objsize);
+ printf("User %10s %10s %10s\n",
+ b1, b2, b3);
+
+ store_size(b1, avg_objwaste);store_size(b2, min_objwaste);
+ store_size(b3, max_objwaste);
+ printf("Loss %10s %10s %10s\n",
+ b1, b2, b3);
+}
+
+void sort_slabs(void)
+{
+ struct slabinfo *s1,*s2;
+
+ for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) {
+ for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
+ int result;
+
+ if (sort_size)
+ result = slab_size(s1) < slab_size(s2);
+ else
+ result = strcasecmp(s1->name, s2->name);
+
+ if (show_inverted)
+ result = -result;
+
+ if (result > 0) {
+ struct slabinfo t;
+
+ memcpy(&t, s1, sizeof(struct slabinfo));
+ memcpy(s1, s2, sizeof(struct slabinfo));
+ memcpy(s2, &t, sizeof(struct slabinfo));
+ }
+ }
+ }
+}
+
+void sort_aliases(void)
+{
+ struct aliasinfo *a1,*a2;
+
+ for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) {
+ for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) {
+ char *n1, *n2;
+
+ n1 = a1->name;
+ n2 = a2->name;
+ if (show_alias && !show_inverted) {
+ n1 = a1->ref;
+ n2 = a2->ref;
+ }
+ if (strcasecmp(n1, n2) > 0) {
+ struct aliasinfo t;
+
+ memcpy(&t, a1, sizeof(struct aliasinfo));
+ memcpy(a1, a2, sizeof(struct aliasinfo));
+ memcpy(a2, &t, sizeof(struct aliasinfo));
+ }
+ }
+ }
+}
+
+void link_slabs(void)
+{
+ struct aliasinfo *a;
+ struct slabinfo *s;
+
+ for (a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+ for(s = slabinfo; s < slabinfo + slabs; s++)
+ if (strcmp(a->ref, s->name) == 0) {
+ a->slab = s;
+ s->refs++;
+ break;
+ }
+ if (s == slabinfo + slabs)
+ fatal("Unresolved alias %s\n", a->ref);
+ }
+}
+
+void alias(void)
+{
+ struct aliasinfo *a;
+ char *active = NULL;
+
+ sort_aliases();
+ link_slabs();
+
+ for(a = aliasinfo; a < aliasinfo + aliases; a++) {
+
+ if (!show_single_ref && a->slab->refs == 1)
+ continue;
+
+ if (!show_inverted) {
+ if (active) {
+ if (strcmp(a->slab->name, active) == 0) {
+ printf(" %s", a->name);
+ continue;
+ }
+ }
+ printf("\n%-20s <- %s", a->slab->name, a->name);
+ active = a->slab->name;
+ }
+ else
+ printf("%-20s -> %s\n", a->name, a->slab->name);
+ }
+ if (active)
+ printf("\n");
+}
+
+
+void rename_slabs(void)
+{
+ struct slabinfo *s;
+ struct aliasinfo *a;
+
+ for (s = slabinfo; s < slabinfo + slabs; s++) {
+ if (*s->name != ':')
+ continue;
+
+ if (s->refs > 1 && !show_first_alias)
+ continue;
+
+ a = find_one_alias(s);
+
+ s->name = a->name;
+ }
+}
+
+int slab_mismatch(char *slab)
+{
+ return regexec(&pattern, slab, 0, NULL, 0);
+}
+
+void read_slab_dir(void)
+{
+ DIR *dir;
+ struct dirent *de;
+ struct slabinfo *slab = slabinfo;
+ struct aliasinfo *alias = aliasinfo;
+ char *p;
+ char *t;
+ int count;
+
+ dir = opendir(".");
+ while ((de = readdir(dir))) {
+ if (de->d_name[0] == '.' ||
+ slab_mismatch(de->d_name))
+ continue;
+ switch (de->d_type) {
+ case DT_LNK:
+ alias->name = strdup(de->d_name);
+ count = readlink(de->d_name, buffer, sizeof(buffer));
+
+ if (count < 0)
+ fatal("Cannot read symlink %s\n", de->d_name);
+
+ buffer[count] = 0;
+ p = buffer + count;
+ while (p > buffer && p[-1] != '/')
+ p--;
+ alias->ref = strdup(p);
+ alias++;
+ break;
+ case DT_DIR:
+ if (chdir(de->d_name))
+ fatal("Unable to access slab %s\n", slab->name);
+ slab->name = strdup(de->d_name);
+ slab->alias = 0;
+ slab->refs = 0;
+ slab->aliases = get_obj("aliases");
+ slab->align = get_obj("align");
+ slab->cache_dma = get_obj("cache_dma");
+ slab->cpu_slabs = get_obj("cpu_slabs");
+ slab->destroy_by_rcu = get_obj("destroy_by_rcu");
+ slab->hwcache_align = get_obj("hwcache_align");
+ slab->object_size = get_obj("object_size");
+ slab->objects = get_obj("objects");
+ slab->objs_per_slab = get_obj("objs_per_slab");
+ slab->order = get_obj("order");
+ slab->partial = get_obj("partial");
+ slab->partial = get_obj_and_str("partial", &t);
+ decode_numa_list(slab->numa_partial, t);
+ slab->poison = get_obj("poison");
+ slab->reclaim_account = get_obj("reclaim_account");
+ slab->red_zone = get_obj("red_zone");
+ slab->sanity_checks = get_obj("sanity_checks");
+ slab->slab_size = get_obj("slab_size");
+ slab->slabs = get_obj_and_str("slabs", &t);
+ decode_numa_list(slab->numa, t);
+ slab->store_user = get_obj("store_user");
+ slab->trace = get_obj("trace");
+ chdir("..");
+ if (slab->name[0] == ':')
+ alias_targets++;
+ slab++;
+ break;
+ default :
+ fatal("Unknown file type %lx\n", de->d_type);
+ }
+ }
+ closedir(dir);
+ slabs = slab - slabinfo;
+ aliases = alias - aliasinfo;
+ if (slabs > MAX_SLABS)
+ fatal("Too many slabs\n");
+ if (aliases > MAX_ALIASES)
+ fatal("Too many aliases\n");
+}
+
+void output_slabs(void)
+{
+ struct slabinfo *slab;
+
+ for (slab = slabinfo; slab < slabinfo + slabs; slab++) {
+
+ if (slab->alias)
+ continue;
+
+
+ if (show_numa)
+ slab_numa(slab);
+ else
+ if (show_track)
+ show_tracking(slab);
+ else
+ if (validate)
+ slab_validate(slab);
+ else
+ if (shrink)
+ slab_shrink(slab);
+ else {
+ if (show_slab)
+ slabcache(slab);
+ }
+ }
+}
+
+struct option opts[] = {
+ { "aliases", 0, NULL, 'a' },
+ { "slabs", 0, NULL, 'l' },
+ { "numa", 0, NULL, 'n' },
+ { "zero", 0, NULL, 'z' },
+ { "help", 0, NULL, 'h' },
+ { "validate", 0, NULL, 'v' },
+ { "first-alias", 0, NULL, 'f' },
+ { "shrink", 0, NULL, 's' },
+ { "track", 0, NULL, 't'},
+ { "inverted", 0, NULL, 'i'},
+ { "1ref", 0, NULL, '1'},
+ { NULL, 0, NULL, 0 }
+};
+
+int main(int argc, char *argv[])
+{
+ int c;
+ int err;
+ char *pattern_source;
+
+ page_size = getpagesize();
+ if (chdir("/sys/slab"))
+ fatal("This kernel does not have SLUB support.\n");
+
+ while ((c = getopt_long(argc, argv, "afhil1npstvzTS", opts, NULL)) != -1)
+ switch(c) {
+ case '1':
+ show_single_ref = 1;
+ break;
+ case 'a':
+ show_alias = 1;
+ break;
+ case 'f':
+ show_first_alias = 1;
+ break;
+ case 'h':
+ usage();
+ return 0;
+ case 'i':
+ show_inverted = 1;
+ break;
+ case 'n':
+ show_numa = 1;
+ break;
+ case 's':
+ shrink = 1;
+ break;
+ case 'l':
+ show_slab = 1;
+ break;
+ case 't':
+ show_track = 1;
+ break;
+ case 'v':
+ validate = 1;
+ break;
+ case 'z':
+ skip_zero = 0;
+ break;
+ case 'T':
+ show_totals = 1;
+ break;
+ case 'S':
+ sort_size = 1;
+ break;
+
+ default:
+ fatal("%s: Invalid option '%c'\n", argv[0], optopt);
+
+ }
+
+ if (!show_slab && !show_alias && !show_track
+ && !validate && !shrink)
+ show_slab = 1;
+
+ if (argc > optind)
+ pattern_source = argv[optind];
+ else
+ pattern_source = ".*";
+
+ err = regcomp(&pattern, pattern_source, REG_ICASE|REG_NOSUB);
+ if (err)
+ fatal("%s: Invalid pattern '%s' code %d\n",
+ argv[0], pattern_source, err);
+ read_slab_dir();
+ if (show_alias)
+ alias();
+ else
+ if (show_totals)
+ totals();
+ else {
+ link_slabs();
+ rename_slabs();
+ sort_slabs();
+ output_slabs();
+ }
+ return 0;
+}
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
new file mode 100644
index 0000000..727c8d8
--- /dev/null
+++ b/Documentation/vm/slub.txt
@@ -0,0 +1,113 @@
+Short users guide for SLUB
+--------------------------
+
+First of all slub should transparently replace SLAB. If you enable
+SLUB then everything should work the same (Note the word "should".
+There is likely not much value in that word at this point).
+
+The basic philosophy of SLUB is very different from SLAB. SLAB
+requires rebuilding the kernel to activate debug options for all
+SLABS. SLUB always includes full debugging but its off by default.
+SLUB can enable debugging only for selected slabs in order to avoid
+an impact on overall system performance which may make a bug more
+difficult to find.
+
+In order to switch debugging on one can add a option "slub_debug"
+to the kernel command line. That will enable full debugging for
+all slabs.
+
+Typically one would then use the "slabinfo" command to get statistical
+data and perform operation on the slabs. By default slabinfo only lists
+slabs that have data in them. See "slabinfo -h" for more options when
+running the command. slabinfo can be compiled with
+
+gcc -o slabinfo Documentation/vm/slabinfo.c
+
+Some of the modes of operation of slabinfo require that slub debugging
+be enabled on the command line. F.e. no tracking information will be
+available without debugging on and validation can only partially
+be performed if debugging was not switched on.
+
+Some more sophisticated uses of slub_debug:
+-------------------------------------------
+
+Parameters may be given to slub_debug. If none is specified then full
+debugging is enabled. Format:
+
+slub_debug=<Debug-Options> Enable options for all slabs
+slub_debug=<Debug-Options>,<slab name>
+ Enable options only for select slabs
+
+Possible debug options are
+ F Sanity checks on (enables SLAB_DEBUG_FREE. Sorry
+ SLAB legacy issues)
+ Z Red zoning
+ P Poisoning (object and padding)
+ U User tracking (free and alloc)
+ T Trace (please only use on single slabs)
+
+F.e. in order to boot just with sanity checks and red zoning one would specify:
+
+ slub_debug=FZ
+
+Trying to find an issue in the dentry cache? Try
+
+ slub_debug=,dentry_cache
+
+to only enable debugging on the dentry cache.
+
+Red zoning and tracking may realign the slab. We can just apply sanity checks
+to the dentry cache with
+
+ slub_debug=F,dentry_cache
+
+In case you forgot to enable debugging on the kernel command line: It is
+possible to enable debugging manually when the kernel is up. Look at the
+contents of:
+
+/sys/slab/<slab name>/
+
+Look at the writable files. Writing 1 to them will enable the
+corresponding debug option. All options can be set on a slab that does
+not contain objects. If the slab already contains objects then sanity checks
+and tracing may only be enabled. The other options may cause the realignment
+of objects.
+
+Careful with tracing: It may spew out lots of information and never stop if
+used on the wrong slab.
+
+SLAB Merging
+------------
+
+If no debugging is specified then SLUB may merge similar slabs together
+in order to reduce overhead and increase cache hotness of objects.
+slabinfo -a displays which slabs were merged together.
+
+Getting more performance
+------------------------
+
+To some degree SLUB's performance is limited by the need to take the
+list_lock once in a while to deal with partial slabs. That overhead is
+governed by the order of the allocation for each slab. The allocations
+can be influenced by kernel parameters:
+
+slub_min_objects=x (default 8)
+slub_min_order=x (default 0)
+slub_max_order=x (default 4)
+
+slub_min_objects allows to specify how many objects must at least fit
+into one slab in order for the allocation order to be acceptable.
+In general slub will be able to perform this number of allocations
+on a slab without consulting centralized resources (list_lock) where
+contention may occur.
+
+slub_min_order specifies a minim order of slabs. A similar effect like
+slub_min_objects.
+
+slub_max_order specified the order at which slub_min_objects should no
+longer be checked. This is useful to avoid SLUB trying to generate
+super large order pages to fit slub_min_objects of a slab cache with
+large object sizes into one high order page.
+
+
+Christoph Lameter, <clameter@sgi.com>, April 10, 2007