diff options
author | Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> | 2016-03-18 10:16:43 +0100 |
---|---|---|
committer | Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> | 2016-03-18 10:16:43 +0100 |
commit | 7f2bb91aecbdc8af268c56f80034d8f3dc0204aa (patch) | |
tree | 59506f762d56cb9a31856509496553707fcfd6a9 | |
parent | 6309ce85786ff96bd04f62dd5cb62a7b0c69f763 (diff) | |
parent | fb7e634f3d725728d65e0c32c5d20c99d91a158e (diff) | |
download | system_core-7f2bb91aecbdc8af268c56f80034d8f3dc0204aa.zip system_core-7f2bb91aecbdc8af268c56f80034d8f3dc0204aa.tar.gz system_core-7f2bb91aecbdc8af268c56f80034d8f3dc0204aa.tar.bz2 |
Merge branch 'cm-13.0' of https://github.com/CyanogenMod/android_system_core into replicant-6.0replicant-6.0-alpha-0004
47 files changed, 15736 insertions, 73 deletions
diff --git a/adb/adb.cpp b/adb/adb.cpp index 0ff9e3c..9f9e6bd 100644 --- a/adb/adb.cpp +++ b/adb/adb.cpp @@ -919,7 +919,7 @@ int handle_host_request(char *service, transport_type ttype, char* serial, int r if(!strncmp(service,"get-state",strlen("get-state"))) { transport = acquire_one_transport(CS_ANY, ttype, serial, NULL); SendOkay(reply_fd); - SendProtocolString(reply_fd, transport->connection_state_name()); + SendProtocolString(reply_fd, transport ? transport->connection_state_name() : "unknown"); return 0; } #endif // ADB_HOST diff --git a/adb/commandline.cpp b/adb/commandline.cpp index cd635ce..eb0c84b 100644 --- a/adb/commandline.cpp +++ b/adb/commandline.cpp @@ -756,8 +756,10 @@ static int logcat(transport_type transport, const char* serial, int argc, const static int mkdirs(const char *path) { + std::string holder(path); + int ret; - char *x = (char *)path + 1; + char *x = &holder[1]; for(;;) { x = adb_dirstart(x); @@ -774,7 +776,7 @@ static int mkdirs(const char *path) } static int backup(int argc, const char** argv) { - const char* filename = "./backup.ab"; + const char* filename = "backup.ab"; /* find, extract, and use any -f argument */ for (int i = 1; i < argc; i++) { diff --git a/fastboot/fastboot.cpp b/fastboot/fastboot.cpp index ec3b84a..5112c15 100644 --- a/fastboot/fastboot.cpp +++ b/fastboot/fastboot.cpp @@ -279,7 +279,7 @@ void usage(void) "usage: fastboot [ <option> ] <command>\n" "\n" "commands:\n" - " update <filename> reflash device from update.zip\n" + " update <filename> [ -a <boot.img> ] reflash device from update.zip\n" " flashall flash boot, system, vendor and if found,\n" " recovery\n" " flash <partition> [ <filename> ] write a file to a flash partition\n" @@ -329,6 +329,9 @@ void usage(void) " default: 2048\n" " -S <size>[K|M|G] automatically sparse files greater\n" " than size. 0 to disable\n" + " -R reboot device (e.g. 
after flash)\n" + " -a <boot.img> use alternate <boot.img> instead of\n" + " boot.img in update.zip file\n" ); } @@ -740,7 +743,7 @@ void do_update_signature(ZipArchiveHandle zip, char *fn) fb_queue_command("signature", "installing signature"); } -void do_update(usb_handle *usb, const char *filename, int erase_first) +void do_update(usb_handle *usb, const char *filename, int erase_first, const char *alt_boot_fname) { queue_info_dump(); @@ -763,18 +766,27 @@ void do_update(usb_handle *usb, const char *filename, int erase_first) setup_requirements(reinterpret_cast<char*>(data), sz); for (size_t i = 0; i < ARRAY_SIZE(images); ++i) { - int fd = unzip_to_file(zip, images[i].img_name); - if (fd == -1) { - if (images[i].is_optional) { - continue; + fastboot_buffer buf; + // support alt images only for boot partition + bool from_zip = alt_boot_fname == NULL || + strncmp(images[i].part_name, "boot", sizeof(images[i].part_name)); + if (from_zip) { + int fd = unzip_to_file(zip, images[i].img_name); + if (fd == -1) { + if (images[i].is_optional) { + continue; + } + CloseArchive(zip); + exit(1); // unzip_to_file already explained why. } - CloseArchive(zip); - exit(1); // unzip_to_file already explained why. 
+ int rc = load_buf_fd(usb, fd, &buf); + if (rc) die("cannot load %s from flash", images[i].img_name); + do_update_signature(zip, images[i].sig_name); + } else { + int rc = load_buf(usb, alt_boot_fname, &buf); + if (rc) die("cannot load %s", alt_boot_fname); } - fastboot_buffer buf; - int rc = load_buf_fd(usb, fd, &buf); - if (rc) die("cannot load %s from flash", images[i].img_name); - do_update_signature(zip, images[i].sig_name); + if (erase_first && needs_erase(usb, images[i].part_name)) { fb_queue_erase(images[i].part_name); } @@ -1017,8 +1029,10 @@ int main(int argc, char **argv) int status; int c; int longindex; + const char *alt_boot_fname = NULL; const struct option longopts[] = { + {"alt-boot", required_argument, 0, 'a'}, {"base", required_argument, 0, 'b'}, {"kernel_offset", required_argument, 0, 'k'}, {"page_size", required_argument, 0, 'n'}, @@ -1027,18 +1041,22 @@ int main(int argc, char **argv) {"help", no_argument, 0, 'h'}, {"unbuffered", no_argument, 0, 0}, {"version", no_argument, 0, 0}, + {"reboot", no_argument, 0, 'R'}, {0, 0, 0, 0} }; serial = getenv("ANDROID_SERIAL"); while (1) { - c = getopt_long(argc, argv, "wub:k:n:r:t:s:S:lp:c:i:m:h", longopts, &longindex); + c = getopt_long(argc, argv, "wub:k:n:r:t:s:S:lp:c:i:m:hRa:", longopts, &longindex); if (c < 0) { break; } /* Alphabetical cases */ switch (c) { + case 'a': + alt_boot_fname = optarg; + break; case 'b': base_addr = strtoul(optarg, 0, 16); break; @@ -1074,6 +1092,9 @@ int main(int argc, char **argv) case 'r': ramdisk_offset = strtoul(optarg, 0, 16); break; + case 'R': + wants_reboot = 1; + break; case 't': tags_offset = strtoul(optarg, 0, 16); break; @@ -1251,10 +1272,10 @@ int main(int argc, char **argv) wants_reboot = 1; } else if(!strcmp(*argv, "update")) { if (argc > 1) { - do_update(usb, argv[1], erase_first); + do_update(usb, argv[1], erase_first, alt_boot_fname); skip(2); } else { - do_update(usb, "update.zip", erase_first); + do_update(usb, "update.zip", erase_first, 
alt_boot_fname); skip(1); } wants_reboot = 1; diff --git a/fs_mgr/fs_mgr_format.c b/fs_mgr/fs_mgr_format.c index e932990..8bda19c 100644 --- a/fs_mgr/fs_mgr_format.c +++ b/fs_mgr/fs_mgr_format.c @@ -33,7 +33,7 @@ extern void reset_ext4fs_info(); static int format_ext4(char *fs_blkdev, char *fs_mnt_point, long long fs_length) { - unsigned int nr_sec; + uint64_t dev_sz; int fd, rc = 0; if ((fd = open(fs_blkdev, O_WRONLY, 0644)) < 0) { @@ -41,7 +41,7 @@ static int format_ext4(char *fs_blkdev, char *fs_mnt_point, long long fs_length) return -1; } - if ((ioctl(fd, BLKGETSIZE, &nr_sec)) == -1) { + if ((ioctl(fd, BLKGETSIZE64, &dev_sz)) == -1) { ERROR("Cannot get block device size. %s\n", strerror(errno)); close(fd); return -1; @@ -49,7 +49,7 @@ static int format_ext4(char *fs_blkdev, char *fs_mnt_point, long long fs_length) /* Format the partition using the calculated length */ reset_ext4fs_info(); - info.len = ((off64_t)nr_sec * 512); + info.len = (off64_t)dev_sz; if (fs_length > 0) { info.len = fs_length; diff --git a/healthd/Android.mk b/healthd/Android.mk index e5fffc0..d1e005e 100644 --- a/healthd/Android.mk +++ b/healthd/Android.mk @@ -9,6 +9,12 @@ LOCAL_CFLAGS := -Werror include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) +LOCAL_SRC_FILES := healthd_board_msm.cpp +LOCAL_MODULE := libhealthd.msm +LOCAL_CFLAGS := -Werror +include $(BUILD_STATIC_LIBRARY) + +include $(CLEAR_VARS) LOCAL_SRC_FILES := \ healthd.cpp \ @@ -55,6 +61,10 @@ endif LOCAL_HAL_STATIC_LIBRARIES := libhealthd +ifeq ($(BOARD_USES_QCOM_HARDWARE),true) +BOARD_HAL_STATIC_LIBRARIES ?= libhealthd.msm +endif + # Symlink /charger to /sbin/healthd LOCAL_POST_INSTALL_CMD := $(hide) mkdir -p $(TARGET_ROOT_OUT) \ && rm -f $(TARGET_ROOT_OUT)/charger && ln -sf /sbin/healthd $(TARGET_ROOT_OUT)/charger diff --git a/healthd/healthd_board_msm.cpp b/healthd/healthd_board_msm.cpp new file mode 100644 index 0000000..d17095f --- /dev/null +++ b/healthd/healthd_board_msm.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2014 
The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#include <cutils/android_reboot.h> +#include <cutils/klog.h> +#include <cutils/misc.h> +#include <cutils/uevent.h> +#include <cutils/properties.h> + +#include <pthread.h> +#include <linux/android_alarm.h> +#include <sys/timerfd.h> +#include <linux/rtc.h> + +#include <healthd.h> + +#define LOGE(x...) do { KLOG_ERROR("charger", x); } while (0) +#define LOGI(x...) do { KLOG_INFO("charger", x); } while (0) +#define LOGV(x...) 
do { KLOG_DEBUG("charger", x); } while (0) + +enum alarm_time_type { + ALARM_TIME, + RTC_TIME, +}; + +/* + * shouldn't be changed after + * reading from alarm register + */ +static time_t alm_secs; + +static int alarm_get_time(enum alarm_time_type time_type, + time_t *secs) +{ + struct tm tm; + unsigned int cmd; + int rc, fd = -1; + + if (!secs) + return -1; + + fd = open("/dev/rtc0", O_RDONLY); + if (fd < 0) { + LOGE("Can't open rtc devfs node\n"); + return -1; + } + + switch (time_type) { + case ALARM_TIME: + cmd = RTC_ALM_READ; + break; + case RTC_TIME: + cmd = RTC_RD_TIME; + break; + default: + LOGE("Invalid time type\n"); + goto err; + } + + rc = ioctl(fd, cmd, &tm); + if (rc < 0) { + LOGE("Unable to get time\n"); + goto err; + } + + *secs = mktime(&tm) + tm.tm_gmtoff; + if (*secs < 0) { + LOGE("Invalid seconds = %ld\n", *secs); + goto err; + } + + close(fd); + return 0; + +err: + close(fd); + return -1; +} + +#define ERR_SECS 2 +static int alarm_is_alm_expired() +{ + int rc; + time_t rtc_secs; + + rc = alarm_get_time(RTC_TIME, &rtc_secs); + if (rc < 0) + return 0; + + return (alm_secs >= rtc_secs - ERR_SECS && + alm_secs <= rtc_secs + ERR_SECS) ? 
1 : 0; +} + +static int timerfd_set_reboot_time_and_wait(time_t secs) +{ + int fd; + int ret = -1; + fd = timerfd_create(CLOCK_REALTIME_ALARM, 0); + if (fd < 0) { + LOGE("Can't open timerfd alarm node\n"); + goto err_return; + } + + struct itimerspec spec; + memset(&spec, 0, sizeof(spec)); + spec.it_value.tv_sec = secs; + + if (timerfd_settime(fd, 0 /* relative */, &spec, NULL)) { + LOGE("Can't set timerfd alarm\n"); + goto err_close; + } + + uint64_t unused; + if (read(fd, &unused, sizeof(unused)) < 0) { + LOGE("Wait alarm error\n"); + goto err_close; + } + + ret = 0; +err_close: + close(fd); +err_return: + return ret; +} + +static int alarm_set_reboot_time_and_wait(time_t secs) +{ + int rc, fd; + struct timespec ts; + + fd = open("/dev/alarm", O_RDWR); + if (fd < 0) { + LOGE("Can't open alarm devfs node, trying timerfd\n"); + return timerfd_set_reboot_time_and_wait(secs); + } + + /* get the elapsed realtime from boot time to now */ + rc = ioctl(fd, ANDROID_ALARM_GET_TIME( + ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP), &ts); + if (rc < 0) { + LOGE("Unable to get elapsed realtime\n"); + goto err; + } + + /* calculate the elapsed time from boot time to reboot time */ + ts.tv_sec += secs; + ts.tv_nsec = 0; + + rc = ioctl(fd, ANDROID_ALARM_SET( + ANDROID_ALARM_ELAPSED_REALTIME_WAKEUP), &ts); + if (rc < 0) { + LOGE("Unable to set reboot time to %ld\n", secs); + goto err; + } + + do { + rc = ioctl(fd, ANDROID_ALARM_WAIT); + } while ((rc < 0 && errno == EINTR) || !alarm_is_alm_expired()); + + if (rc <= 0) { + LOGE("Unable to wait on alarm\n"); + goto err; + } + + close(fd); + return 0; + +err: + if (fd >= 0) + close(fd); + return -1; +} + +static void *alarm_thread(void *) +{ + time_t rtc_secs, rb_secs; + int rc; + + /* + * to support power off alarm, the time + * stored in alarm register at latest + * shutdown time should be some time + * earlier than the actual alarm time + * set by user + */ + rc = alarm_get_time(ALARM_TIME, &alm_secs); + if (rc < 0 || !alm_secs) + goto 
err; + + rc = alarm_get_time(RTC_TIME, &rtc_secs); + if (rc < 0) + goto err; + + /* + * calculate the reboot time after which + * the phone will reboot + */ + rb_secs = alm_secs - rtc_secs; + if (rb_secs <= 0) + goto err; + + rc = alarm_set_reboot_time_and_wait(rb_secs); + if (rc < 0) + goto err; + + LOGI("Exit from power off charging, reboot the phone!\n"); + android_reboot(ANDROID_RB_RESTART, 0, 0); + +err: + LOGE("Exit from alarm thread\n"); + return NULL; +} + +void healthd_board_init(struct healthd_config*) +{ + pthread_t tid; + int rc; + char value[PROP_VALUE_MAX]; + + property_get("ro.bootmode", value, ""); + if (!strcmp("charger", value)) { + rc = pthread_create(&tid, NULL, alarm_thread, NULL); + if (rc < 0) + LOGE("Create alarm thread failed\n"); + } +} + +int healthd_board_battery_update(struct android::BatteryProperties*) +{ + // return 0 to log periodic polled battery status to kernel log + return 1; +} + +void healthd_board_mode_charger_draw_battery(struct android::BatteryProperties*) +{ + +} + +void healthd_board_mode_charger_battery_update(struct android::BatteryProperties*) +{ + +} + +void healthd_board_mode_charger_set_backlight(bool) +{ + +} + +void healthd_board_mode_charger_init() +{ + +} diff --git a/healthd/healthd_mode_charger.cpp b/healthd/healthd_mode_charger.cpp index 1b6e216..eb93d6a 100644 --- a/healthd/healthd_mode_charger.cpp +++ b/healthd/healthd_mode_charger.cpp @@ -156,7 +156,7 @@ static struct frame batt_anim_frames[] = { { .disp_time = 750, .min_capacity = 80, - .level_only = true, + .level_only = false, .surface = NULL, }, { diff --git a/init/Android.mk b/init/Android.mk index 159c4f2..aa32236 100644 --- a/init/Android.mk +++ b/init/Android.mk @@ -85,7 +85,6 @@ LOCAL_STATIC_LIBRARIES := \ liblogwrap \ libcutils \ libbase \ - libext4_utils_static \ libutils \ liblog \ libc \ diff --git a/init/builtins.cpp b/init/builtins.cpp index e3e64f4..b290ce3 100644 --- a/init/builtins.cpp +++ b/init/builtins.cpp @@ -585,6 +585,9 @@ int 
do_powerctl(int nargs, char **args) } if (strncmp(command, "shutdown", 8) == 0) { + if (property_get_bool("init.shutdown_to_charging", false)) { + return android_reboot(ANDROID_RB_RESTART2, 0, "charging"); + } cmd = ANDROID_RB_POWEROFF; len = 8; } else if (strncmp(command, "reboot", 6) == 0) { diff --git a/init/property_service.cpp b/init/property_service.cpp index 11ff06b..fe82bef 100644 --- a/init/property_service.cpp +++ b/init/property_service.cpp @@ -150,6 +150,33 @@ int __property_get(const char *name, char *value) return __system_property_get(name, value); } +bool property_get_bool(const char *key, bool default_value) { + if (!key) { + return default_value; + } + + bool result = default_value; + char buf[PROP_VALUE_MAX] = {'\0',}; + + int len = __property_get(key, buf); + if (len == 1) { + char ch = buf[0]; + if (ch == '0' || ch == 'n') { + result = false; + } else if (ch == '1' || ch == 'y') { + result = true; + } + } else if (len > 1) { + if (!strcmp(buf, "no") || !strcmp(buf, "false") || !strcmp(buf, "off")) { + result = false; + } else if (!strcmp(buf, "yes") || !strcmp(buf, "true") || !strcmp(buf, "on")) { + result = true; + } + } + + return result; +} + static void write_persistent_property(const char *name, const char *value) { char tempPath[PATH_MAX]; diff --git a/init/property_service.h b/init/property_service.h index 303f251..6b542b5 100644 --- a/init/property_service.h +++ b/init/property_service.h @@ -28,6 +28,7 @@ extern void start_property_service(void); void get_property_workspace(int *fd, int *sz); extern int __property_get(const char *name, char *value); extern int property_set(const char *name, const char *value); +extern bool property_get_bool(const char *name, bool def_value); extern bool properties_initialized(); #ifndef __clang__ diff --git a/libcutils/sched_policy.c b/libcutils/sched_policy.c index 83222f4..70dc8c4 100644 --- a/libcutils/sched_policy.c +++ b/libcutils/sched_policy.c @@ -61,6 +61,7 @@ static int bg_cgroup_fd = -1; 
static int fg_cgroup_fd = -1; // File descriptors open to /dev/cpuset/../tasks, setup by initialize, or -1 on error +static int system_bg_cpuset_fd = -1; static int bg_cpuset_fd = -1; static int fg_cpuset_fd = -1; @@ -126,6 +127,8 @@ static void __initialize(void) { fg_cpuset_fd = open(filename, O_WRONLY | O_CLOEXEC); filename = "/dev/cpuset/background/tasks"; bg_cpuset_fd = open(filename, O_WRONLY | O_CLOEXEC); + filename = "/dev/cpuset/system-background/tasks"; + system_bg_cpuset_fd = open(filename, O_WRONLY | O_CLOEXEC); } #endif @@ -260,6 +263,9 @@ int set_cpuset_policy(int tid, SchedPolicy policy) case SP_AUDIO_SYS: fd = fg_cpuset_fd; break; + case SP_SYSTEM: + fd = system_bg_cpuset_fd; + break; default: fd = -1; break; diff --git a/liblog/Android.mk b/liblog/Android.mk index 115dd79..6714498 100644 --- a/liblog/Android.mk +++ b/liblog/Android.mk @@ -25,11 +25,13 @@ include $(CLEAR_VARS) liblog_cflags := -DLIBLOG_LOG_TAG=1005 ifneq ($(TARGET_USES_LOGD),false) -liblog_sources := logd_write.c log_event_write.c +liblog_sources := logd_write.c else liblog_sources := logd_write_kern.c endif +liblog_sources += log_event_write.c + # some files must not be compiled when building against Mingw # they correspond to features not used by our host development tools # which are also hard or even impossible to port to native Win32 diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk index f02da7f..e6c9094 100644 --- a/libpixelflinger/Android.mk +++ b/libpixelflinger/Android.mk @@ -7,9 +7,16 @@ include $(CLEAR_VARS) include $(CLEAR_VARS) PIXELFLINGER_SRC_FILES:= \ + codeflinger/CodeCache.cpp \ + format.cpp \ + clear.cpp \ + raster.cpp \ + buffer.cpp + +ifeq ($(filter x86%,$(TARGET_ARCH)),) +PIXELFLINGER_SRC_FILES += \ codeflinger/ARMAssemblerInterface.cpp \ codeflinger/ARMAssemblerProxy.cpp \ - codeflinger/CodeCache.cpp \ codeflinger/GGLAssembler.cpp \ codeflinger/load_store.cpp \ codeflinger/blending.cpp \ @@ -19,10 +26,8 @@ PIXELFLINGER_SRC_FILES:= \ 
pixelflinger.cpp.arm \ trap.cpp.arm \ scanline.cpp.arm \ - format.cpp \ - clear.cpp \ - raster.cpp \ - buffer.cpp + +endif PIXELFLINGER_CFLAGS := -fstrict-aliasing -fomit-frame-pointer @@ -43,6 +48,18 @@ PIXELFLINGER_SRC_FILES_arm64 := \ arch-arm64/col32cb16blend.S \ arch-arm64/t32cb16blend.S \ +PIXELFLINGER_SRC_FILES_x86 := \ + codeflinger/x86/X86Assembler.cpp \ + codeflinger/x86/GGLX86Assembler.cpp \ + codeflinger/x86/load_store.cpp \ + codeflinger/x86/blending.cpp \ + codeflinger/x86/texturing.cpp \ + fixed.cpp \ + picker.cpp \ + pixelflinger.cpp \ + trap.cpp \ + scanline.cpp + ifndef ARCH_MIPS_REV6 PIXELFLINGER_SRC_FILES_mips := \ codeflinger/MIPSAssembler.cpp \ @@ -58,12 +75,16 @@ LOCAL_MODULE:= libpixelflinger LOCAL_SRC_FILES := $(PIXELFLINGER_SRC_FILES) LOCAL_SRC_FILES_arm := $(PIXELFLINGER_SRC_FILES_arm) LOCAL_SRC_FILES_arm64 := $(PIXELFLINGER_SRC_FILES_arm64) +LOCAL_SRC_FILES_x86 := $(PIXELFLINGER_SRC_FILES_x86) +LOCAL_SRC_FILES_x86_64 := $(PIXELFLINGER_SRC_FILES_x86) LOCAL_SRC_FILES_mips := $(PIXELFLINGER_SRC_FILES_mips) LOCAL_CFLAGS := $(PIXELFLINGER_CFLAGS) LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include LOCAL_C_INCLUDES += $(LOCAL_EXPORT_C_INCLUDE_DIRS) \ external/safe-iop/include LOCAL_SHARED_LIBRARIES := libcutils liblog libutils +LOCAL_WHOLE_STATIC_LIBRARIES_x86 := libenc +LOCAL_WHOLE_STATIC_LIBRARIES_x86_64 := libenc # Really this should go away entirely or at least not depend on # libhardware, but this at least gets us built. 
diff --git a/libpixelflinger/codeflinger/Android.mk b/libpixelflinger/codeflinger/Android.mk new file mode 100644 index 0000000..8004af7 --- /dev/null +++ b/libpixelflinger/codeflinger/Android.mk @@ -0,0 +1,3 @@ +ifneq ($(filter x86%,$(TARGET_ARCH)),) +include $(call all-named-subdir-makefiles,x86/libenc) +endif diff --git a/libpixelflinger/codeflinger/x86/GGLX86Assembler.cpp b/libpixelflinger/codeflinger/x86/GGLX86Assembler.cpp new file mode 100644 index 0000000..1b24503 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/GGLX86Assembler.cpp @@ -0,0 +1,1507 @@ +/* libs/pixelflinger/codeflinger/x86/GGLX86Assembler.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#define LOG_TAG "GGLX86Assembler" + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <cutils/log.h> + +#include "codeflinger/x86/GGLX86Assembler.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +GGLX86Assembler::GGLX86Assembler(const sp<Assembly>& assembly) + : X86Assembler(assembly), X86RegisterAllocator(), mOptLevel(7) +{ +} + +GGLX86Assembler::~GGLX86Assembler() +{ +} + +void GGLX86Assembler::reset(int opt_level) +{ + X86Assembler::reset(); + X86RegisterAllocator::reset(); + mOptLevel = opt_level; +} + +// --------------------------------------------------------------------------- + +int GGLX86Assembler::scanline(const needs_t& needs, context_t const* c) +{ + int err = 0; + err = scanline_core(needs, c); + if (err != 0) + ALOGE("scanline_core failed probably due to running out of the registers: %d\n", err); + + // XXX: in theory, pcForLabel is not valid before generate() + char* fragment_start_pc = pcForLabel("fragment_loop"); + char* fragment_end_pc = pcForLabel("fragment_end"); + const int per_fragment_ins_size = int(fragment_end_pc - fragment_start_pc); + + // build a name for our pipeline + char name[128]; + sprintf(name, + "scanline__%08X:%08X_%08X_%08X [%3d ipp ins size]", + needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ins_size); + + if (err) { + ALOGE("Error while generating ""%s""\n", name); + disassemble(name); + return -1; + } + + return generate(name); +} + +int GGLX86Assembler::scanline_core(const needs_t& needs, context_t const* c) +{ + int64_t duration = ggl_system_time(); + + mBlendFactorCached = 0; + mBlending = 0; + mMasking = 0; + mAA = GGL_READ_NEEDS(P_AA, needs.p); + mDithering = GGL_READ_NEEDS(P_DITHER, needs.p); + mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER; + mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER; + mFog = GGL_READ_NEEDS(P_FOG, needs.p) != 0; + mSmooth 
= GGL_READ_NEEDS(SHADE, needs.n) != 0; + mBuilderContext.needs = needs; + mBuilderContext.c = c; + mBuilderContext.Rctx = obtainReg(); //dynamically obtain if used and then immediately recycle it if not used + mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ]; + + // ------------------------------------------------------------------------ + + decodeLogicOpNeeds(needs); + + decodeTMUNeeds(needs, c); + + mBlendSrc = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n)); + mBlendDst = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n)); + mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n)); + mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n)); + + if (!mCbFormat.c[GGLFormat::ALPHA].h) { + if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendSrc == GGL_DST_ALPHA)) { + mBlendSrc = GGL_ONE; + } + if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendSrcA == GGL_DST_ALPHA)) { + mBlendSrcA = GGL_ONE; + } + if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendDst == GGL_DST_ALPHA)) { + mBlendDst = GGL_ONE; + } + if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) || + (mBlendDstA == GGL_DST_ALPHA)) { + mBlendDstA = GGL_ONE; + } + } + + // if we need the framebuffer, read it now + const int blending = blending_codes(mBlendSrc, mBlendDst) | + blending_codes(mBlendSrcA, mBlendDstA); + + // XXX: handle special cases, destination not modified... + if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) && + (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) { + // Destination unmodified (beware of logic ops) + } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) && + (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) { + // Destination is zero (beware of logic ops) + } + + int fbComponents = 0; + const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n); + for (int i=0 ; i<4 ; i++) { + const int mask = 1<<i; + component_info_t& info = mInfo[i]; + int fs = i==GGLFormat::ALPHA ? 
mBlendSrcA : mBlendSrc; + int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst; + if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA) + fs = GGL_ONE; + info.masked = !!(masking & mask); + info.inDest = !info.masked && mCbFormat.c[i].h && + ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp)); + if (mCbFormat.components >= GGL_LUMINANCE && + (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) { + info.inDest = false; + } + info.needed = (i==GGLFormat::ALPHA) && + (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS); + info.replaced = !!(mTextureMachine.replaced & mask); + info.iterated = (!info.replaced && (info.inDest || info.needed)); + info.smooth = mSmooth && info.iterated; + info.fog = mFog && info.inDest && (i != GGLFormat::ALPHA); + info.blend = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO)); + + mBlending |= (info.blend ? mask : 0); + mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0; + fbComponents |= mCbFormat.c[i].h ? mask : 0; + } + + mAllMasked = (mMasking == fbComponents); + if (mAllMasked) { + mDithering = 0; + } + + fragment_parts_t parts; + + // ------------------------------------------------------------------------ + callee_work(); + // ------------------------------------------------------------------------ + + mCurSp = -12; // %ebx, %edi, %esi + prepare_esp(0); + build_scanline_preparation(parts, needs); + recycleReg(mBuilderContext.Rctx); + + if (registerFile().status()) + return registerFile().status(); + + // ------------------------------------------------------------------------ + label("fragment_loop"); + // ------------------------------------------------------------------------ + { + Scratch regs(registerFile()); + int temp_reg = -1; + + if (mDithering) { + // update the dither index. 
+ temp_reg = regs.obtain(); + //To load to register and calculate should be fast than the memory operations + MOV_MEM_TO_REG(parts.count.offset_ebp, PhysicalReg_EBP, temp_reg); + ROR(GGL_DITHER_ORDER_SHIFT, temp_reg); + ADD_IMM_TO_REG(1 << (32 - GGL_DITHER_ORDER_SHIFT), temp_reg); + ROR(32 - GGL_DITHER_ORDER_SHIFT, temp_reg); + MOV_REG_TO_MEM(temp_reg, parts.count.offset_ebp, PhysicalReg_EBP); + regs.recycle(temp_reg); + + } + + // XXX: could we do an early alpha-test here in some cases? + // It would probaly be used only with smooth-alpha and no texture + // (or no alpha component in the texture). + + // Early z-test + if (mAlphaTest==GGL_ALWAYS) { + build_depth_test(parts, Z_TEST|Z_WRITE); + } else { + // we cannot do the z-write here, because + // it might be killed by the alpha-test later + build_depth_test(parts, Z_TEST); + } + + { // texture coordinates + Scratch scratches(registerFile()); + + // texel generation + build_textures(parts, regs); + + } + + if ((blending & (FACTOR_DST|BLEND_DST)) || + (mMasking && !mAllMasked) || + (mLogicOp & LOGIC_OP_DST)) + { + // blending / logic_op / masking need the framebuffer + mDstPixel.setTo(regs.obtain(), &mCbFormat); + + // load the framebuffer pixel + comment("fetch color-buffer"); + parts.cbPtr.reg = regs.obtain(); + MOV_MEM_TO_REG(parts.cbPtr.offset_ebp, PhysicalReg_EBP, parts.cbPtr.reg); + load(parts.cbPtr, mDstPixel); + mCurSp = mCurSp - 4; + mDstPixel.offset_ebp = mCurSp; + MOV_REG_TO_MEM(mDstPixel.reg, mDstPixel.offset_ebp, EBP); + regs.recycle(mDstPixel.reg); + regs.recycle(parts.cbPtr.reg); + mDstPixel.reg = -1; + } + + if (registerFile().status()) + return registerFile().status(); + + pixel_t pixel; + int directTex = mTextureMachine.directTexture; + if (directTex | parts.packed) { + // note: we can't have both here + // iterated color or direct texture + if(directTex) { + pixel.offset_ebp = parts.texel[directTex-1].offset_ebp; + } + else + pixel.offset_ebp = parts.iterated.offset_ebp; + pixel.reg = 
regs.obtain(); + MOV_MEM_TO_REG(pixel.offset_ebp, EBP, pixel.reg); + //pixel = directTex ? parts.texel[directTex-1] : parts.iterated; + pixel.flags &= ~CORRUPTIBLE; + } else { + if (mDithering) { + mBuilderContext.Rctx = regs.obtain(); + temp_reg = regs.obtain(); + const int ctxtReg = mBuilderContext.Rctx; + MOV_MEM_TO_REG(8, EBP, ctxtReg); + const int mask = GGL_DITHER_SIZE-1; + parts.dither = reg_t(regs.obtain()); + MOV_MEM_TO_REG(parts.count.offset_ebp, EBP, parts.dither.reg); + AND_IMM_TO_REG(mask, parts.dither.reg); + ADD_REG_TO_REG(ctxtReg, parts.dither.reg); + MOVZX_MEM_TO_REG(OpndSize_8, parts.dither.reg, GGL_OFFSETOF(ditherMatrix), temp_reg); + MOV_REG_TO_REG(temp_reg, parts.dither.reg); + mCurSp = mCurSp - 4; + parts.dither.offset_ebp = mCurSp; + MOV_REG_TO_MEM(parts.dither.reg, parts.dither.offset_ebp, EBP); + regs.recycle(parts.dither.reg); + regs.recycle(temp_reg); + regs.recycle(mBuilderContext.Rctx); + + } + + // allocate a register for the resulting pixel + pixel.setTo(regs.obtain(), &mCbFormat, FIRST); + + build_component(pixel, parts, GGLFormat::ALPHA, regs); + + if (mAlphaTest!=GGL_ALWAYS) { + // only handle the z-write part here. We know z-test + // was successful, as well as alpha-test. + build_depth_test(parts, Z_WRITE); + } + + build_component(pixel, parts, GGLFormat::RED, regs); + build_component(pixel, parts, GGLFormat::GREEN, regs); + build_component(pixel, parts, GGLFormat::BLUE, regs); + + pixel.flags |= CORRUPTIBLE; + } + + if (registerFile().status()) { + return registerFile().status(); + } + + if (pixel.reg == -1) { + // be defensive here. if we're here it's probably + // that this whole fragment is a no-op. 
+ pixel = mDstPixel; + } + + if (!mAllMasked) { + // logic operation + build_logic_op(pixel, regs); + + // masking + build_masking(pixel, regs); + + comment("store"); + parts.cbPtr.reg = regs.obtain(); + MOV_MEM_TO_REG(parts.cbPtr.offset_ebp, EBP, parts.cbPtr.reg); + store(parts.cbPtr, pixel, WRITE_BACK); + MOV_REG_TO_MEM(parts.cbPtr.reg, parts.cbPtr.offset_ebp, EBP); + regs.recycle(parts.cbPtr.reg); + regs.recycle(pixel.reg); + } + } + + if (registerFile().status()) + return registerFile().status(); + + // update the iterated color... + if (parts.reload != 3) { + build_smooth_shade(parts); + } + + // update iterated z + build_iterate_z(parts); + + // update iterated fog + build_iterate_f(parts); + + //SUB_IMM_TO_REG(1<<16, parts.count.reg); + SUB_IMM_TO_MEM(1<<16, parts.count.offset_ebp, EBP); + + JCC(Mnemonic_JNS, "fragment_loop"); + label("fragment_end"); + int update_esp_offset, shrink_esp_offset; + update_esp_offset = shrink_esp_offset = -mCurSp - 12; // 12 is ebx, esi, edi + update_esp(update_esp_offset); + shrink_esp(shrink_esp_offset); + return_work(); + + if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) { + if (mDepthTest!=GGL_ALWAYS) { + label("discard_before_textures"); + build_iterate_texture_coordinates(parts); + } + label("discard_after_textures"); + build_smooth_shade(parts); + build_iterate_z(parts); + build_iterate_f(parts); + if (!mAllMasked) { + //ADD_IMM_TO_REG(parts.cbPtr.size>>3, parts.cbPtr.reg); + ADD_IMM_TO_MEM(parts.cbPtr.size>>3, parts.cbPtr.offset_ebp, EBP); + } + SUB_IMM_TO_MEM(1<<16, parts.count.offset_ebp, EBP); + //SUB_IMM_TO_REG(1<<16, parts.count.reg); + JCC(Mnemonic_JNS, "fragment_loop"); + update_esp_offset = shrink_esp_offset = -mCurSp - 12; // 12 is ebx, esi, edi + update_esp(update_esp_offset); + shrink_esp(shrink_esp_offset); + return_work(); + } + + return registerFile().status(); +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_scanline_preparation( 
+ fragment_parts_t& parts, const needs_t& needs) +{ + Scratch scratches(registerFile()); + + // compute count + comment("compute ct (# of pixels to process)"); + int temp_reg; + parts.count.setTo(obtainReg()); + int Rx = scratches.obtain(); + int Ry = scratches.obtain(); + // the only argument is +8 bytes relative to the current EBP + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(Rx, iterators.xl); + CONTEXT_LOAD(parts.count.reg, iterators.xr); + CONTEXT_LOAD(Ry, iterators.y); + + // parts.count = iterators.xr - Rx + SUB_REG_TO_REG(Rx, parts.count.reg); + SUB_IMM_TO_REG(1, parts.count.reg); + + if (mDithering) { + // parts.count.reg = 0xNNNNXXDD + // NNNN = count-1 + // DD = dither offset + // XX = 0xxxxxxx (x = garbage) + Scratch scratches(registerFile()); + int tx = scratches.obtain(); + int ty = scratches.obtain(); + + MOV_REG_TO_REG(Rx,tx); + AND_IMM_TO_REG(GGL_DITHER_MASK, tx); + MOV_REG_TO_REG(Ry,ty); + AND_IMM_TO_REG(GGL_DITHER_MASK, ty); + SHL(GGL_DITHER_ORDER_SHIFT, ty); + ADD_REG_TO_REG(ty, tx); + SHL(16, parts.count.reg); + OR_REG_TO_REG(tx, parts.count.reg); + scratches.recycle(tx); + scratches.recycle(ty); + } else { + // parts.count.reg = 0xNNNN0000 + // NNNN = count-1 + SHL(16, parts.count.reg); + } + mCurSp = mCurSp - 4; + parts.count.offset_ebp = mCurSp; //ebx, esi, edi, parts.count.reg + MOV_REG_TO_MEM(parts.count.reg, parts.count.offset_ebp, EBP); + //PUSH(parts.count.reg); + recycleReg(parts.count.reg); + parts.count.reg=-1; + if (!mAllMasked) { + // compute dst ptr + comment("compute color-buffer pointer"); + const int cb_bits = mCbFormat.size*8; + int Rs = scratches.obtain(); + temp_reg = scratches.obtain(); + CONTEXT_LOAD(Rs, state.buffers.color.stride); + MOVSX_REG_TO_REG(OpndSize_16, Ry, temp_reg); + MOVSX_REG_TO_REG(OpndSize_16, Rs, Rs); + IMUL(temp_reg, Rs); + scratches.recycle(temp_reg); + ADD_REG_TO_REG(Rx, Rs); + + parts.cbPtr.setTo(obtainReg(), cb_bits); + CONTEXT_LOAD(parts.cbPtr.reg, state.buffers.color.data); + reg_t 
temp_reg_t; + temp_reg_t.setTo(Rs); + base_offset(parts.cbPtr, parts.cbPtr, temp_reg_t); + + mCurSp = mCurSp - 4; + parts.cbPtr.offset_ebp = mCurSp; //ebx, esi, edi, parts.count.reg, parts.cbPtr.reg + MOV_REG_TO_MEM(parts.cbPtr.reg, parts.cbPtr.offset_ebp, EBP); + //PUSH(parts.cbPtr.reg); + recycleReg(parts.cbPtr.reg); + parts.cbPtr.reg=-1; + scratches.recycle(Rs); + } + + // init fog + const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p); + if (need_fog) { + comment("compute initial fog coordinate"); + Scratch scratches(registerFile()); + int ydfdy = scratches.obtain(); + int dfdx = scratches.obtain(); + CONTEXT_LOAD(dfdx, generated_vars.dfdx); + IMUL(Rx, dfdx); + CONTEXT_LOAD(ydfdy, iterators.ydfdy); + ADD_REG_TO_REG(ydfdy, dfdx); // Rx * dfdx + ydfdy + CONTEXT_STORE(dfdx, generated_vars.f); + scratches.recycle(dfdx); + scratches.recycle(ydfdy); + } + + // init Z coordinate + if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) { + parts.z = reg_t(obtainReg()); + comment("compute initial Z coordinate"); + Scratch scratches(registerFile()); + int dzdx = scratches.obtain(); + int ydzdy = parts.z.reg; + CONTEXT_LOAD(dzdx, generated_vars.dzdx); // 1.31 fixed-point + IMUL(Rx, dzdx); + CONTEXT_LOAD(ydzdy, iterators.ydzdy); // 1.31 fixed-point + ADD_REG_TO_REG(dzdx, ydzdy); // parts.z.reg = Rx * dzdx + ydzdy + + mCurSp = mCurSp - 4; + parts.z.offset_ebp = mCurSp; //ebx, esi, edi, parts.count.reg, parts.cbPtr.reg, parts.z.reg + MOV_REG_TO_MEM(ydzdy, parts.z.offset_ebp, EBP); + //PUSH(ydzdy); + recycleReg(ydzdy); + parts.z.reg=-1; + + // we're going to index zbase of parts.count + // zbase = base + (xl-count + stride*y)*2 by arm + // !!! 
Actually, zbase = base + (xl + stride*y)*2 + int Rs = dzdx; + int zbase = scratches.obtain(); + temp_reg = zbase; + CONTEXT_LOAD(Rs, state.buffers.depth.stride); + MOVSX_REG_TO_REG(OpndSize_16, Rs, Rs); + MOV_REG_TO_REG(Ry, temp_reg); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg, temp_reg); + IMUL(temp_reg, Rs); + ADD_REG_TO_REG(Rx, Rs); + // load parts.count.reg + MOV_MEM_TO_REG(parts.count.offset_ebp, EBP, temp_reg); + SHR(16, temp_reg); + ADD_REG_TO_REG(temp_reg, Rs); + SHL(1, Rs); + CONTEXT_LOAD(zbase, state.buffers.depth.data); + ADD_REG_TO_REG(Rs, zbase); + CONTEXT_STORE(zbase, generated_vars.zbase); + scratches.recycle(zbase); + scratches.recycle(dzdx); + } + // the rgisters are all used up + + // init texture coordinates + init_textures(parts.coords, reg_t(Rx), reg_t(Ry)); + scratches.recycle(Ry); + + // iterated color + init_iterated_color(parts, reg_t(Rx)); + + // init coverage factor application (anti-aliasing) + if (mAA) { + parts.covPtr.setTo(obtainReg(), 16); + CONTEXT_LOAD(parts.covPtr.reg, state.buffers.coverage); + SHL(1, Rx); + ADD_REG_TO_REG(Rx, parts.covPtr.reg); + + mCurSp = mCurSp - 4; + parts.covPtr.offset_ebp = mCurSp; + MOV_REG_TO_MEM(parts.covPtr.reg, parts.covPtr.offset_ebp, EBP); + //PUSH(parts.covPtr.reg); + recycleReg(parts.covPtr.reg); + parts.covPtr.reg=-1; + } + scratches.recycle(Rx); +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_component( pixel_t& pixel, + fragment_parts_t& parts, + int component, + Scratch& regs) +{ + static char const * comments[] = {"alpha", "red", "green", "blue"}; + comment(comments[component]); + + // local register file + Scratch scratches(registerFile()); + const int dst_component_size = pixel.component_size(component); + + component_t temp(-1); + build_incoming_component( temp, dst_component_size, + parts, component, scratches, regs); + + if (mInfo[component].inDest) { + // blending... 
+ build_blending( temp, mDstPixel, component, scratches ); + + // downshift component and rebuild pixel... + downshift(pixel, component, temp, parts.dither); + } +} + +void GGLX86Assembler::build_incoming_component( + component_t& temp, + int dst_size, + fragment_parts_t& parts, + int component, + Scratch& scratches, + Scratch& global_regs) +{ + const uint32_t component_mask = 1<<component; + + // Figure out what we need for the blending stage... + int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc; + int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst; + if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) { + fs = GGL_ONE; + } + + // Figure out what we need to extract and for what reason + const int blending = blending_codes(fs, fd); + + // Are we actually going to blend? + const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO)); + + // expand the source if the destination has more bits + int need_expander = false; + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) { + texture_unit_t& tmu = mTextureMachine.tmu[i]; + if ((tmu.format_idx) && + (parts.texel[i].component_size(component) < dst_size)) { + need_expander = true; + } + } + + // do we need to extract this component? + const bool multiTexture = mTextureMachine.activeUnits > 1; + const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) && + (isAlphaSourceNeeded()); + int need_extract = mInfo[component].needed; + if (mInfo[component].inDest) + { + need_extract |= ((need_blending ? + (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander)); + need_extract |= (mTextureMachine.mask != mTextureMachine.replaced); + need_extract |= mInfo[component].smooth; + need_extract |= mInfo[component].fog; + need_extract |= mDithering; + need_extract |= multiTexture; + } + + if (need_extract) { + Scratch& regs = blend_needs_alpha_source ? 
global_regs : scratches; + component_t fragment; + + // iterated color + fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE); + build_iterated_color(fragment, parts, component, regs); + + // texture environment (decal, modulate, replace) + build_texture_environment(fragment, parts, component, regs); + + // expand the source if the destination has more bits + if (need_expander && (fragment.size() < dst_size)) { + // we're here only if we fetched a texel + // (so we know for sure fragment is CORRUPTIBLE) + //fragment is stored on the stack + expand(fragment, fragment, dst_size); + } + + mCurSp = mCurSp - 4; + fragment.offset_ebp = mCurSp; + MOV_REG_TO_MEM(fragment.reg, fragment.offset_ebp, EBP); + regs.recycle(fragment.reg); + + // We have a few specific things to do for the alpha-channel + if ((component==GGLFormat::ALPHA) && + (mInfo[component].needed || fragment.size()<dst_size)) + { + // convert to integer_t first and make sure + // we don't corrupt a needed register + if (fragment.l) { + //component_t incoming(fragment); + // actually fragment is not corruptible + //modify(fragment, regs); + //MOV_REG_TO_REG(incoming.reg, fragment.reg); + SHR(fragment.l, fragment.offset_ebp, EBP); + fragment.h -= fragment.l; + fragment.l = 0; + } + + // I haven't found any case to trigger coverage and the following alpha test (mAlphaTest != GGL_ALWAYS) + fragment.reg = regs.obtain(); + MOV_MEM_TO_REG(fragment.offset_ebp, EBP, fragment.reg); + + // coverage factor application + build_coverage_application(fragment, parts, regs); + // alpha-test + build_alpha_test(fragment, parts); + + MOV_REG_TO_MEM(fragment.reg, fragment.offset_ebp, EBP); + regs.recycle(fragment.reg); + + if (blend_needs_alpha_source) { + // We keep only 8 bits for the blending stage + const int shift = fragment.h <= 8 ? 
0 : fragment.h-8; + + if (fragment.flags & CORRUPTIBLE) { + fragment.flags &= ~CORRUPTIBLE; + mAlphaSource.setTo(fragment.reg, + fragment.size(), fragment.flags, fragment.offset_ebp); + //mCurSp = mCurSp - 4; + //mAlphaSource.offset_ebp = mCurSp; + if (shift) { + SHR(shift, mAlphaSource.offset_ebp, EBP); + } + } else { + // XXX: it would better to do this in build_blend_factor() + // so we can avoid the extra MOV below. + mAlphaSource.setTo(regs.obtain(), + fragment.size(), CORRUPTIBLE); + mCurSp = mCurSp - 4; + mAlphaSource.offset_ebp = mCurSp; + if (shift) { + MOV_MEM_TO_REG(fragment.offset_ebp, EBP, mAlphaSource.reg); + SHR(shift, mAlphaSource.reg); + } else { + MOV_MEM_TO_REG(fragment.offset_ebp, EBP, mAlphaSource.reg); + } + MOV_REG_TO_MEM(mAlphaSource.reg, mAlphaSource.offset_ebp, EBP); + regs.recycle(mAlphaSource.reg); + } + mAlphaSource.s -= shift; + + } + } + + // fog... + build_fog( fragment, component, regs ); + + temp = fragment; + } else { + if (mInfo[component].inDest) { + // extraction not needed and replace + // we just select the right component + if ((mTextureMachine.replaced & component_mask) == 0) { + // component wasn't replaced, so use it! 
+ temp = component_t(parts.iterated, component); + } + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) { + const texture_unit_t& tmu = mTextureMachine.tmu[i]; + if ((tmu.mask & component_mask) && + ((tmu.replaced & component_mask) == 0)) { + temp = component_t(parts.texel[i], component); + } + } + } + } +} + +bool GGLX86Assembler::isAlphaSourceNeeded() const +{ + // XXX: also needed for alpha-test + const int bs = mBlendSrc; + const int bd = mBlendDst; + return bs==GGL_SRC_ALPHA_SATURATE || + bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA || + bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_smooth_shade(fragment_parts_t& parts) +{ + if (mSmooth && !parts.iterated_packed) { + // update the iterated color in a pipelined way... + comment("update iterated color"); + Scratch scratches(registerFile()); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + + const int reload = parts.reload; + for (int i=0 ; i<4 ; i++) { + if (!mInfo[i].iterated) + continue; + + int dx = parts.argb_dx[i].reg; + int c = parts.argb[i].reg; + dx = scratches.obtain(); + c = scratches.obtain(); + CONTEXT_LOAD(dx, generated_vars.argb[i].dx); + CONTEXT_LOAD(c, generated_vars.argb[i].c); + + //if (reload & 1) { + // c = scratches.obtain(); + // CONTEXT_LOAD(c, generated_vars.argb[i].c); + //} + //if (reload & 2) { + // dx = scratches.obtain(); + // CONTEXT_LOAD(dx, generated_vars.argb[i].dx); + //} + + if (mSmooth) { + ADD_REG_TO_REG(dx, c); + } + + CONTEXT_STORE(c, generated_vars.argb[i].c); + scratches.recycle(c); + scratches.recycle(dx); + //if (reload & 1) { + // CONTEXT_STORE(c, generated_vars.argb[i].c); + // scratches.recycle(c); + //} + //if (reload & 2) { + // scratches.recycle(dx); + //} + } + scratches.recycle(mBuilderContext.Rctx); + } +} + +// --------------------------------------------------------------------------- + +void 
GGLX86Assembler::build_coverage_application(component_t& fragment, + fragment_parts_t& parts, Scratch& regs) +{ + // here fragment.l is guarenteed to be 0 + if (mAA) { + // coverages are 1.15 fixed-point numbers + comment("coverage application"); + + component_t incoming(fragment); + modify(fragment, regs); + + Scratch scratches(registerFile()); + int cf = scratches.obtain(); + parts.covPtr.reg = scratches.obtain(); + MOV_MEM_TO_REG(parts.covPtr.offset_ebp, EBP, parts.covPtr.reg); + MOVZX_MEM_TO_REG(OpndSize_16, parts.covPtr.reg, 2, cf); // refer to LDRH definition + scratches.recycle(parts.covPtr.reg); + if (fragment.h > 31) { + fragment.h--; + + int flag_push_edx = 0; + int flag_reserve_edx = 0; + int temp_reg2 = -1; + int edx_offset_ebp = 0; + if(scratches.isUsed(EDX) == 1) { + if(incoming.reg != EDX && cf != EDX) { + flag_push_edx = 1; + mCurSp = mCurSp - 4; + edx_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EDX, edx_offset_ebp, EBP); + } + } + else { + flag_reserve_edx = 1; + scratches.reserve(EDX); + } + if(scratches.isUsed(EAX)) { + if( cf == EAX || incoming.reg == EAX) { + MOVSX_REG_TO_REG(OpndSize_16, cf, cf); + if(cf == EAX) + IMUL(incoming.reg); + else + IMUL(cf); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, incoming.reg); + } + else { + int eax_offset_ebp = 0; + if(scratches.countFreeRegs() > 0) { + temp_reg2 = scratches.obtain(); + MOV_REG_TO_REG(EAX, temp_reg2); + } + else { + mCurSp = mCurSp - 4; + eax_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EAX, eax_offset_ebp, EBP); + } + MOV_REG_TO_REG(cf, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(incoming.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, incoming.reg); + if(temp_reg2 > -1) { + MOV_REG_TO_REG(temp_reg2, EAX); + scratches.recycle(temp_reg2); + } + else { + MOV_MEM_TO_REG(eax_offset_ebp, EBP, EAX); + } + } + } + else { + MOV_REG_TO_REG(cf, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + 
IMUL(incoming.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, incoming.reg); + } + if(flag_push_edx == 1) { + MOV_MEM_TO_REG(edx_offset_ebp, EBP, EDX); + } + if(flag_reserve_edx ==1) + scratches.recycle(EDX); + + MOV_REG_TO_REG(incoming.reg, fragment.reg); + + //IMUL(cf, incoming.reg); + } else { + MOV_REG_TO_REG(incoming.reg, fragment.reg); + SHL(1, fragment.reg); + + int flag_push_edx = 0; + int flag_reserve_edx = 0; + int temp_reg2 = -1; + int edx_offset_ebp = 0; + if(scratches.isUsed(EDX) == 1) { + if(fragment.reg != EDX && cf != EDX) { + flag_push_edx = 1; + mCurSp = mCurSp - 4; + edx_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EDX, edx_offset_ebp, EBP); + } + } + else { + flag_reserve_edx = 1; + scratches.reserve(EDX); + } + if(scratches.isUsed(EAX)) { + if( cf == EAX || fragment.reg == EAX) { + MOVSX_REG_TO_REG(OpndSize_16, cf, cf); + if(cf == EAX) + IMUL(fragment.reg); + else + IMUL(cf); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, fragment.reg); + } + else { + int eax_offset_ebp = 0; + if(scratches.countFreeRegs() > 0) { + temp_reg2 = scratches.obtain(); + MOV_REG_TO_REG(EAX, temp_reg2); + } + else { + mCurSp = mCurSp - 4; + eax_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EAX, eax_offset_ebp, EBP); + } + MOV_REG_TO_REG(cf, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(fragment.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, fragment.reg); + if(temp_reg2 > -1) { + MOV_REG_TO_REG(temp_reg2, EAX); + scratches.recycle(temp_reg2); + } + else { + MOV_MEM_TO_REG(eax_offset_ebp, EBP, EAX); + } + } + } + else { + MOV_REG_TO_REG(cf, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(fragment.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, fragment.reg); + } + if(flag_push_edx == 1) { + MOV_MEM_TO_REG(edx_offset_ebp, EBP, EDX); + } + if(flag_reserve_edx ==1) 
+ scratches.recycle(EDX); + + //IMUL(cf, fragment.reg); + } + scratches.recycle(cf); + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_alpha_test(component_t& fragment, + const fragment_parts_t& parts) +{ + if (mAlphaTest != GGL_ALWAYS) { + comment("Alpha Test"); + Scratch scratches(registerFile()); + int ref = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + const int shift = GGL_COLOR_BITS-fragment.size(); + CONTEXT_LOAD(ref, state.alpha_test.ref); + scratches.recycle(mBuilderContext.Rctx); + if (shift) { + SHR(shift, ref); + CMP_REG_TO_REG(ref, fragment.reg); + } else CMP_REG_TO_REG(ref, fragment.reg); + Mnemonic cc = Mnemonic_NULL; + //int cc = NV; + switch (mAlphaTest) { + case GGL_NEVER: + JMP("discard_after_textures"); + return; + break; + case GGL_LESS: + cc = Mnemonic_JNL; + break; + case GGL_EQUAL: + cc = Mnemonic_JNE; + break; + case GGL_LEQUAL: + cc = Mnemonic_JB; + break; + case GGL_GREATER: + cc = Mnemonic_JLE; + break; + case GGL_NOTEQUAL: + cc = Mnemonic_JE; + break; + case GGL_GEQUAL: + cc = Mnemonic_JNC; + break; + } + JCC(cc, "discard_after_textures"); + //B(cc^1, "discard_after_textures"); + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_depth_test( + const fragment_parts_t& parts, uint32_t mask) +{ + mask &= Z_TEST|Z_WRITE; + int store_flag = 0; + const needs_t& needs = mBuilderContext.needs; + const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p); + Scratch scratches(registerFile()); + + if (mDepthTest != GGL_ALWAYS || zmask) { + Mnemonic ic = Mnemonic_NULL; + switch (mDepthTest) { + case GGL_LESS: + ic = Mnemonic_JBE; + break; + case GGL_EQUAL: + ic = Mnemonic_JNE; + break; + case GGL_LEQUAL: + ic = Mnemonic_JB; + break; + case GGL_GREATER: + ic = Mnemonic_JGE; + break; + case GGL_NOTEQUAL: + ic = Mnemonic_JE; + break; + case GGL_GEQUAL: + 
ic = Mnemonic_JA; + break; + case GGL_NEVER: + // this never happens, because it's taken care of when + // computing the needs. but we keep it for completness. + comment("Depth Test (NEVER)"); + JMP("discard_before_textures"); + return; + case GGL_ALWAYS: + // we're here because zmask is enabled + mask &= ~Z_TEST; // test always passes. + break; + } + + + if ((mask & Z_WRITE) && !zmask) { + mask &= ~Z_WRITE; + } + + if (!mask) + return; + + comment("Depth Test"); + + int zbase = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(zbase, generated_vars.zbase); // stall + scratches.recycle(mBuilderContext.Rctx); + + int temp_reg1 = scratches.obtain(); + int depth = scratches.obtain(); + int z = parts.z.reg; + MOV_MEM_TO_REG(parts.count.offset_ebp, PhysicalReg_EBP, temp_reg1); + SHR(15, temp_reg1); + SUB_REG_TO_REG(temp_reg1, zbase); + + // above does zbase = zbase + ((count >> 16) << 1) + + if (mask & Z_TEST) { + MOVZX_MEM_TO_REG(OpndSize_16, zbase, 0, depth); + MOV_MEM_TO_REG(parts.z.offset_ebp, PhysicalReg_EBP, temp_reg1); + SHR(16, temp_reg1); + CMP_REG_TO_REG(temp_reg1, depth); + JCC(ic, "discard_before_textures"); + + } + if (mask & Z_WRITE) { + if (mask == Z_WRITE) { + // only z-write asked, cc is meaningless + store_flag = 1; + } + // actually it must be stored since the above branch is not taken + MOV_REG_TO_MEM(temp_reg1, 0, zbase, OpndSize_16); + } + scratches.recycle(temp_reg1); + scratches.recycle(zbase); + scratches.recycle(depth); + } +} + +void GGLX86Assembler::build_iterate_z(const fragment_parts_t& parts) +{ + const needs_t& needs = mBuilderContext.needs; + if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) { + Scratch scratches(registerFile()); + int dzdx = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(dzdx, generated_vars.dzdx); // stall + 
scratches.recycle(mBuilderContext.Rctx); + ADD_REG_TO_MEM(dzdx, EBP, parts.z.offset_ebp); + scratches.recycle(dzdx); + } +} + +void GGLX86Assembler::build_iterate_f(const fragment_parts_t& parts) +{ + const needs_t& needs = mBuilderContext.needs; + if (GGL_READ_NEEDS(P_FOG, needs.p)) { + Scratch scratches(registerFile()); + int dfdx = scratches.obtain(); + int f = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(f, generated_vars.f); + CONTEXT_LOAD(dfdx, generated_vars.dfdx); // stall + ADD_REG_TO_REG(dfdx, f); + CONTEXT_STORE(f, generated_vars.f); + scratches.recycle(mBuilderContext.Rctx); + scratches.recycle(dfdx); + scratches.recycle(f); + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_logic_op(pixel_t& pixel, Scratch& regs) +{ + const needs_t& needs = mBuilderContext.needs; + const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; + if (opcode == GGL_COPY) + return; + + comment("logic operation"); + + pixel_t s(pixel); + if (!(pixel.flags & CORRUPTIBLE)) { + pixel.reg = regs.obtain(); + pixel.flags |= CORRUPTIBLE; + } + + pixel_t d(mDstPixel); + d.reg = regs.obtain(); + MOV_MEM_TO_REG(mDstPixel.offset_ebp, EBP, d.reg); + switch(opcode) { + case GGL_CLEAR: + MOV_IMM_TO_REG(0, pixel.reg); + break; + case GGL_AND: + MOV_REG_TO_REG(d.reg, pixel.reg); + AND_REG_TO_REG(s.reg, pixel.reg); + break; + case GGL_AND_REVERSE: + MOV_REG_TO_REG(d.reg, pixel.reg); + NOT(pixel.reg); + AND_REG_TO_REG(s.reg, pixel.reg); + break; + case GGL_COPY: + break; + case GGL_AND_INVERTED: + MOV_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + AND_REG_TO_REG(d.reg, pixel.reg); + break; + case GGL_NOOP: + MOV_REG_TO_REG(d.reg, pixel.reg); + break; + case GGL_XOR: + MOV_REG_TO_REG(d.reg, pixel.reg); + XOR(s.reg, pixel.reg); + break; + case GGL_OR: + MOV_REG_TO_REG(d.reg, pixel.reg); + OR_REG_TO_REG(s.reg, pixel.reg); + break; + case 
GGL_NOR: + MOV_REG_TO_REG(d.reg, pixel.reg); + OR_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_EQUIV: + MOV_REG_TO_REG(d.reg, pixel.reg); + XOR(s.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_INVERT: + MOV_REG_TO_REG(d.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_OR_REVERSE: // s | ~d == ~(~s & d) + MOV_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + AND_REG_TO_REG(d.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_COPY_INVERTED: + MOV_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_OR_INVERTED: // ~s | d == ~(s & ~d) + MOV_REG_TO_REG(d.reg, pixel.reg); + NOT(pixel.reg); + AND_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_NAND: + MOV_REG_TO_REG(d.reg, pixel.reg); + AND_REG_TO_REG(s.reg, pixel.reg); + NOT(pixel.reg); + break; + case GGL_SET: + MOV_IMM_TO_REG(0, pixel.reg); + NOT(pixel.reg); + break; + }; + regs.recycle(d.reg); +} + +// --------------------------------------------------------------------------- + + +void GGLX86Assembler::build_and_immediate(int d, int s, uint32_t mask, int bits) +{ + uint32_t rot; + uint32_t size = ((bits>=32) ? 
0 : (1LU << bits)) - 1; + mask &= size; + + if (mask == size) { + if (d != s) + MOV_REG_TO_REG(s, d); + return; + } + + MOV_REG_TO_REG(s, d); + AND_IMM_TO_REG(mask, d); +} + +void GGLX86Assembler::build_masking(pixel_t& pixel, Scratch& regs) +{ + if (!mMasking || mAllMasked) { + return; + } + + comment("color mask"); + + pixel_t fb(mDstPixel); + fb.reg = regs.obtain(); + MOV_MEM_TO_REG(mDstPixel.offset_ebp, EBP, fb.reg); + pixel_t s(pixel); + if (!(pixel.flags & CORRUPTIBLE)) { + pixel.reg = regs.obtain(); + pixel.flags |= CORRUPTIBLE; + } + + int mask = 0; + for (int i=0 ; i<4 ; i++) { + const int component_mask = 1<<i; + const int h = fb.format.c[i].h; + const int l = fb.format.c[i].l; + if (h && (!(mMasking & component_mask))) { + mask |= ((1<<(h-l))-1) << l; + } + } + + // There is no need to clear the masked components of the source + // (unless we applied a logic op), because they're already zeroed + // by construction (masked components are not computed) + + if (mLogicOp) { + const needs_t& needs = mBuilderContext.needs; + const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; + if (opcode != GGL_CLEAR) { + // clear masked component of source + build_and_immediate(pixel.reg, s.reg, mask, fb.size()); + s = pixel; + } + } + + // clear non masked components of destination + build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); + + // or back the channels that were masked + if (s.reg == fb.reg) { + // this is in fact a MOV + if (s.reg == pixel.reg) { + // ugh. 
this in in fact a nop + } else { + MOV_REG_TO_REG(fb.reg, pixel.reg); + } + } else { + MOV_REG_TO_REG(fb.reg, pixel.reg); + OR_REG_TO_REG(s.reg, pixel.reg); + } + MOV_REG_TO_MEM(fb.reg, mDstPixel.offset_ebp, EBP); +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::base_offset(pointer_t& d, pointer_t& b, const reg_t& o) +{ +// d and b are the same reference + Scratch scratches(registerFile()); + int temp_reg = scratches.obtain(); + switch (b.size) { + case 32: + MOV_REG_TO_REG(b.reg, temp_reg); + MOV_REG_TO_REG(o.reg, d.reg); + SHL(2,d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + break; + case 24: + if (d.reg == b.reg) { + MOV_REG_TO_REG(b.reg, temp_reg); + MOV_REG_TO_REG(o.reg, d.reg); + SHL(1,d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + ADD_REG_TO_REG(o.reg, d.reg); + } else { + MOV_REG_TO_REG(o.reg, temp_reg); + SHL(1,temp_reg); + MOV_REG_TO_REG(temp_reg, d.reg); + ADD_REG_TO_REG(o.reg, d.reg); + ADD_REG_TO_REG(b.reg, d.reg); + } + break; + case 16: + MOV_REG_TO_REG(b.reg, temp_reg); + MOV_REG_TO_REG(o.reg, d.reg); + SHL(1,d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + break; + case 8: + MOV_REG_TO_REG(b.reg, temp_reg); + MOV_REG_TO_REG(o.reg, d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + break; + } + scratches.recycle(temp_reg); +} + +// ---------------------------------------------------------------------------- +// cheezy register allocator... 
+// ---------------------------------------------------------------------------- + +void X86RegisterAllocator::reset() +{ + mRegs.reset(); +} + +int X86RegisterAllocator::reserveReg(int reg) +{ + return mRegs.reserve(reg); +} + +int X86RegisterAllocator::obtainReg() +{ + return mRegs.obtain(); +} + +void X86RegisterAllocator::recycleReg(int reg) +{ + mRegs.recycle(reg); +} + +X86RegisterAllocator::RegisterFile& X86RegisterAllocator::registerFile() +{ + return mRegs; +} + +// ---------------------------------------------------------------------------- + +X86RegisterAllocator::RegisterFile::RegisterFile() + : mRegs(0), mTouched(0), mStatus(0) +{ + //reserve(PhysicalReg_EBP); + //reserve(PhysicalReg_ESP); +} + +X86RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs) + : mRegs(rhs.mRegs), mTouched(rhs.mTouched) +{ +} + +X86RegisterAllocator::RegisterFile::~RegisterFile() +{ +} + +bool X86RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const +{ + return (mRegs == rhs.mRegs); +} + +void X86RegisterAllocator::RegisterFile::reset() +{ + mRegs = mTouched = mStatus = 0; +} + +int X86RegisterAllocator::RegisterFile::reserve(int reg) +{ + LOG_ALWAYS_FATAL_IF(isUsed(reg), + "reserving register %d, but already in use", + reg); + if(isUsed(reg)) return -1; + mRegs |= (1<<reg); + mTouched |= mRegs; + return reg; +} + +void X86RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask) +{ + mRegs |= regMask; + mTouched |= regMask; +} + +int X86RegisterAllocator::RegisterFile::isUsed(int reg) const +{ + LOG_ALWAYS_FATAL_IF(reg>=6, "invalid register %d", reg); + return mRegs & (1<<reg); +} + +int X86RegisterAllocator::RegisterFile::obtain() +{ +//multiplication result is in edx:eax +//ebx, ecx, edi, esi, eax, edx + const char priorityList[6] = { PhysicalReg_EBX, PhysicalReg_ECX,PhysicalReg_EDI, PhysicalReg_ESI, PhysicalReg_EAX, PhysicalReg_EDX }; + + const int nbreg = sizeof(priorityList); + int i, r; + for (i=0 ; i<nbreg ; i++) { + r = 
priorityList[i]; + if (!isUsed(r)) { + break; + } + } + // this is not an error anymore because, we'll try again with + // a lower optimization level. + ALOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n"); + if (i >= nbreg) { + mStatus |= OUT_OF_REGISTERS; + // we return SP so we can more easily debug things + // the code will never be run anyway. + printf("pixelflinger ran out of registers\n"); + return PhysicalReg_ESP; + //return -1; + } + reserve(r); + return r; +} + +bool X86RegisterAllocator::RegisterFile::hasFreeRegs() const +{ + return ((mRegs & 0x3F) == 0x3F) ? false : true; +} + +int X86RegisterAllocator::RegisterFile::countFreeRegs() const +{ + int f = ~mRegs & 0x3F; + // now count number of 1 + f = (f & 0x5555) + ((f>>1) & 0x5555); + f = (f & 0x3333) + ((f>>2) & 0x3333); + f = (f & 0x0F0F) + ((f>>4) & 0x0F0F); + f = (f & 0x00FF) + ((f>>8) & 0x00FF); + return f; +} + +void X86RegisterAllocator::RegisterFile::recycle(int reg) +{ + LOG_FATAL_IF(!isUsed(reg), + "recycling unallocated register %d", + reg); + mRegs &= ~(1<<reg); +} + +void X86RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask) +{ + LOG_FATAL_IF((mRegs & regMask)!=regMask, + "recycling unallocated registers " + "(recycle=%08x, allocated=%08x, unallocated=%08x)", + regMask, mRegs, mRegs®Mask); + mRegs &= ~regMask; +} + +uint32_t X86RegisterAllocator::RegisterFile::touched() const +{ + return mTouched; +} + +// ---------------------------------------------------------------------------- + +}; // namespace android diff --git a/libpixelflinger/codeflinger/x86/GGLX86Assembler.h b/libpixelflinger/codeflinger/x86/GGLX86Assembler.h new file mode 100644 index 0000000..1960cfc --- /dev/null +++ b/libpixelflinger/codeflinger/x86/GGLX86Assembler.h @@ -0,0 +1,563 @@ +/* libs/pixelflinger/codeflinger/x86/GGLX86Assembler.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in 
compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + + +#ifndef ANDROID_GGLX86ASSEMBLER_H +#define ANDROID_GGLX86ASSEMBLER_H + +#include <stdint.h> +#include <sys/types.h> + +#include <private/pixelflinger/ggl_context.h> + +#include "codeflinger/x86/X86Assembler.h" + + +namespace android { + +// ---------------------------------------------------------------------------- + +#define CONTEXT_LOAD(REG, FIELD) \ + MOV_MEM_TO_REG(GGL_OFFSETOF(FIELD), mBuilderContext.Rctx, REG) + +#define CONTEXT_STORE(REG, FIELD) \ + MOV_REG_TO_MEM(REG, GGL_OFFSETOF(FIELD), mBuilderContext.Rctx) + +class X86RegisterAllocator +{ +public: + class RegisterFile; + + RegisterFile& registerFile(); + int reserveReg(int reg); + int obtainReg(); + void recycleReg(int reg); + void reset(); + + class RegisterFile + { + public: + RegisterFile(); + RegisterFile(const RegisterFile& rhs); + ~RegisterFile(); + + void reset(); + + bool operator == (const RegisterFile& rhs) const; + bool operator != (const RegisterFile& rhs) const { + return !operator == (rhs); + } + + int reserve(int reg); + void reserveSeveral(uint32_t regMask); + + void recycle(int reg); + void recycleSeveral(uint32_t regMask); + + int obtain(); + inline int isUsed(int reg) const; + + bool hasFreeRegs() const; + int countFreeRegs() const; + + uint32_t touched() const; + inline uint32_t status() const { return mStatus; } + + enum { + OUT_OF_REGISTERS = 0x1 + }; + + private: + uint32_t mRegs; + uint32_t mTouched; + uint32_t mStatus; + }; + + class Scratch + { + public: + Scratch(RegisterFile& regFile) + : mRegFile(regFile), mScratch(0) { 
+ } + ~Scratch() { + mRegFile.recycleSeveral(mScratch); + } + int obtain() { + int reg = mRegFile.obtain(); + mScratch |= 1<<reg; + return reg; + } + void reserve(int reg) { + mRegFile.reserve(reg); + mScratch |= 1<<reg; + } + void recycle(int reg) { + mRegFile.recycle(reg); + mScratch &= ~(1<<reg); + } + bool isUsed(int reg) { + return (mScratch & (1<<reg)); + } + int countFreeRegs() { + return mRegFile.countFreeRegs(); + } + private: + RegisterFile& mRegFile; + uint32_t mScratch; + }; + +/* +// currently we don't use it + + class Spill + { + public: + Spill(RegisterFile& regFile, X86Assembler& gen, uint32_t reglist) + : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0) + { + if (reglist) { + int count = 0; + while (reglist) { + count++; + reglist &= ~(1 << (31 - __builtin_clz(reglist))); + } + if (count == 1) { + int reg = 31 - __builtin_clz(mRegList); + // move to the stack + } else { + // move to the stack + } + mRegFile.recycleSeveral(mRegList); + mCount = count; + } + } + ~Spill() { + if (mRegList) { + if (mCount == 1) { + int reg = 31 - __builtin_clz(mRegList); + // move to the stack + } else { + } + mRegFile.reserveSeveral(mRegList); + } + } + private: + RegisterFile& mRegFile; + X86Assembler& mGen; + uint32_t mRegList; + int mCount; + }; +*/ + +private: + RegisterFile mRegs; +}; + +// ---------------------------------------------------------------------------- + +class GGLX86Assembler : public X86Assembler, public X86RegisterAllocator +{ +public: + + GGLX86Assembler(const sp<Assembly>& assembly); + ~GGLX86Assembler(); + + char* base() const { return 0; } // XXX + char* pc() const { return 0; } // XXX + + void reset(int opt_level); + + + // generate scanline code for given needs + int scanline(const needs_t& needs, context_t const* c); + int scanline_core(const needs_t& needs, context_t const* c); + + enum { + CLEAR_LO = 0x0001, + CLEAR_HI = 0x0002, + CORRUPTIBLE = 0x0004, + FIRST = 0x0008 + }; + + enum { //load/store flags + WRITE_BACK = 0x0001 + 
}; + + struct reg_t { + reg_t() : reg(-1), flags(0), offset_ebp(0) { + } + reg_t(int r, int f=0, int offset=0) + : reg(r), flags(f), offset_ebp(offset) { + } + void setTo(int r, int f=0, int offset=0) { + reg=r; flags=f; offset_ebp=offset; + } + int reg; + uint16_t flags; + int offset_ebp; + }; + + struct integer_t : public reg_t { + integer_t() : reg_t(), s(0) { + } + integer_t(int r, int sz=32, int f=0, int offset=0) + : reg_t(r, f, offset), s(sz) { + } + void setTo(int r, int sz=32, int f=0, int offset=0) { + reg_t::setTo(r, f, offset); s=sz; + } + int8_t s; + inline int size() const { return s; } + }; + + struct pixel_t : public reg_t { + pixel_t() : reg_t() { + memset(&format, 0, sizeof(GGLFormat)); + } + pixel_t(int r, const GGLFormat* fmt, int f=0, int offset=0) + : reg_t(r, f, offset), format(*fmt) { + } + void setTo(int r, const GGLFormat* fmt, int f=0, int offset=0) { + reg_t::setTo(r, f, offset); format = *fmt; + } + GGLFormat format; + inline int hi(int c) const { return format.c[c].h; } + inline int low(int c) const { return format.c[c].l; } + inline int mask(int c) const { return ((1<<size(c))-1) << low(c); } + inline int size() const { return format.size*8; } + inline int size(int c) const { return component_size(c); } + inline int component_size(int c) const { return hi(c) - low(c); } + }; + + struct component_t : public reg_t { + component_t() : reg_t(), h(0), l(0) { + } + component_t(int r, int f=0, int offset=0) + : reg_t(r, f, offset), h(0), l(0) { + } + component_t(int r, int lo, int hi, int f=0, int offset=0) + : reg_t(r, f, offset), h(hi), l(lo) { + } + explicit component_t(const integer_t& rhs) + : reg_t(rhs.reg, rhs.flags, rhs.offset_ebp), h(rhs.s), l(0) { + } + explicit component_t(const pixel_t& rhs, int component) { + setTo( rhs.reg, + rhs.format.c[component].l, + rhs.format.c[component].h, + rhs.flags|CLEAR_LO|CLEAR_HI, rhs.offset_ebp); + } + void setTo(int r, int lo=0, int hi=0, int f=0, int offset=0) { + reg_t::setTo(r, f, offset); 
h=hi; l=lo; + } + int8_t h; + int8_t l; + inline int size() const { return h-l; } + }; + + struct pointer_t : public reg_t { + pointer_t() : reg_t(), size(0) { + } + pointer_t(int r, int s, int f=0, int offset=0) + : reg_t(r, f, offset), size(s) { + } + void setTo(int r, int s, int f=0, int offset=0) { + reg_t::setTo(r, f, offset); size=s; + } + int8_t size; + }; + + +private: + struct tex_coord_t { + reg_t s; + reg_t t; + pointer_t ptr; + }; + + struct fragment_parts_t { + uint32_t packed : 1; + uint32_t reload : 2; + uint32_t iterated_packed : 1; + pixel_t iterated; + pointer_t cbPtr; + pointer_t covPtr; + reg_t count; + reg_t argb[4]; + reg_t argb_dx[4]; + reg_t z; + reg_t dither; + pixel_t texel[GGL_TEXTURE_UNIT_COUNT]; + tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT]; + }; + + struct texture_unit_t { + int format_idx; + GGLFormat format; + int bits; + int swrap; + int twrap; + int env; + int pot; + int linear; + uint8_t mask; + uint8_t replaced; + }; + + struct texture_machine_t { + texture_unit_t tmu[GGL_TEXTURE_UNIT_COUNT]; + uint8_t mask; + uint8_t replaced; + uint8_t directTexture; + uint8_t activeUnits; + }; + + struct component_info_t { + bool masked : 1; + bool inDest : 1; + bool needed : 1; + bool replaced : 1; + bool iterated : 1; + bool smooth : 1; + bool blend : 1; + bool fog : 1; + }; + + struct builder_context_t { + context_t const* c; + needs_t needs; + int Rctx; + }; + + template <typename T> + void modify(T& r, Scratch& regs) + { + if (!(r.flags & CORRUPTIBLE)) { + r.reg = regs.obtain(); + r.flags |= CORRUPTIBLE; + } + } + + // helpers + void base_offset(pointer_t& d, pointer_t& b, const reg_t& o); + + // texture environement + void modulate( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component); + + void decal( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component); + + void blend( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component, int tmu); 
+ + void add( component_t& dest, + const component_t& incoming, + const pixel_t& texel, int component); + + // load/store stuff + void store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0); + void load(pointer_t& addr, const pixel_t& dest, uint32_t flags=0); + + void extract(integer_t& d, const pixel_t& s, int component); + void extract(component_t& d, const pixel_t& s, int component); + void extract(integer_t& d, int s, int h, int l, int bits=32); + void expand(integer_t& d, const integer_t& s, int dbits); + void expand(integer_t& d, const component_t& s, int dbits); + void expand(component_t& d, const component_t& s, int dbits); + void downshift(pixel_t& d, int component, component_t s, reg_t& dither); + + + void mul_factor( component_t& d, + const integer_t& v, + const integer_t& f, Scratch& scratches); + + void mul_factor_add( component_t& d, + const integer_t& v, + const integer_t& f, + const component_t& a); + + void component_add( component_t& d, + const integer_t& dst, + const integer_t& src); + + void component_sat( const component_t& v, const int temp_reg); + + + void build_scanline_preparation(fragment_parts_t& parts, + const needs_t& needs); + + void build_smooth_shade(fragment_parts_t& parts); + + void build_component( pixel_t& pixel, + fragment_parts_t& parts, + int component, + Scratch& global_scratches); + + void build_incoming_component( + component_t& temp, + int dst_size, + fragment_parts_t& parts, + int component, + Scratch& scratches, + Scratch& global_scratches); + + void init_iterated_color(fragment_parts_t& parts, const reg_t& x); + + void build_iterated_color( component_t& fragment, + fragment_parts_t& parts, + int component, + Scratch& regs); + + void decodeLogicOpNeeds(const needs_t& needs); + + void decodeTMUNeeds(const needs_t& needs, context_t const* c); + + void init_textures( tex_coord_t* coords, + const reg_t& x, + const reg_t& y); + + void build_textures( fragment_parts_t& parts, + Scratch& regs); + + void filter8( 
const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches); + + void filter16( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches); + + void filter24( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS); + + void filter32( const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches); + + void build_texture_environment( component_t& fragment, + fragment_parts_t& parts, + int component, + Scratch& regs); + + void wrapping( int d, + int coord, int size, + int tx_wrap, int tx_linear, Scratch& scratches); + + void build_fog( component_t& temp, + int component, + Scratch& parent_scratches); + + void build_blending( component_t& in_out, + pixel_t& pixel, + int component, + Scratch& parent_scratches); + + void build_blend_factor( + integer_t& factor, int f, int component, + const pixel_t& dst_pixel, + integer_t& fragment, + integer_t& fb, + Scratch& scratches); + + void build_blendFOneMinusF( component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb); + + void build_blendOneMinusFF( component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb); + + void build_coverage_application(component_t& fragment, + fragment_parts_t& parts, + Scratch& regs); + + void build_alpha_test(component_t& fragment, const fragment_parts_t& parts); + + enum { Z_TEST=1, Z_WRITE=2 }; + void build_depth_test(const fragment_parts_t& parts, uint32_t mask); + void build_iterate_z(const fragment_parts_t& parts); + void build_iterate_f(const fragment_parts_t& parts); + void build_iterate_texture_coordinates(const fragment_parts_t& parts); + + void 
build_logic_op(pixel_t& pixel, Scratch& regs); + + void build_masking(pixel_t& pixel, Scratch& regs); + + void build_and_immediate(int d, int s, uint32_t mask, int bits); + + bool isAlphaSourceNeeded() const; + + enum { + FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 + }; + + enum { + LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4 + }; + + static int blending_codes(int fs, int fd); + + builder_context_t mBuilderContext; + texture_machine_t mTextureMachine; + component_info_t mInfo[4]; + int mBlending; + int mMasking; + int mAllMasked; + int mLogicOp; + int mAlphaTest; + int mAA; + int mDithering; + int mDepthTest; + + int mSmooth; + int mFog; + pixel_t mDstPixel; + + GGLFormat mCbFormat; + + int mBlendFactorCached; + integer_t mAlphaSource; + + int mBaseRegister; + + int mBlendSrc; + int mBlendDst; + int mBlendSrcA; + int mBlendDstA; + + int mOptLevel; + + // to stretch esp and shrink esp + int mCurSp; +}; + +// ---------------------------------------------------------------------------- + +}; // namespace android + +#endif // ANDROID_GGLX86ASSEMBLER_H diff --git a/libpixelflinger/codeflinger/x86/X86Assembler.cpp b/libpixelflinger/codeflinger/x86/X86Assembler.cpp new file mode 100644 index 0000000..2a717ac --- /dev/null +++ b/libpixelflinger/codeflinger/x86/X86Assembler.cpp @@ -0,0 +1,618 @@ +/* libs/pixelflinger/codeflinger/x86/X86Assembler.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#define LOG_TAG "X86Assembler" + +#include <stdio.h> +#include <stdlib.h> +#include <cutils/log.h> +#include <cutils/properties.h> +#include <string.h> + +#if defined(WITH_LIB_HARDWARE) +#include <hardware_legacy/qemu_tracing.h> +#endif + +#include <private/pixelflinger/ggl_context.h> + +#include "codeflinger/CodeCache.h" +#include "codeflinger/x86/X86Assembler.h" + +// ---------------------------------------------------------------------------- + +namespace android { + +// ---------------------------------------------------------------------------- + +X86Assembler::X86Assembler(const sp<Assembly>& assembly) + : mAssembly(assembly) +{ + mBase = mStream = (char *)assembly->base(); + mDuration = ggl_system_time(); +#if defined(WITH_LIB_HARDWARE) + mQemuTracing = true; +#endif +} + +X86Assembler::~X86Assembler() +{ +} + +char* X86Assembler::pc() const +{ + return mStream; +} + +char* X86Assembler::base() const +{ + return mBase; +} + +void X86Assembler::reset() +{ + mBase = mStream = (char *)mAssembly->base(); + mBranchTargets.clear(); + mLabels.clear(); + mLabelsInverseMapping.clear(); + mComments.clear(); +} + +// ---------------------------------------------------------------------------- + +void X86Assembler::disassemble(const char* name) +{ + if (name) { + printf("%s:\n", name); + } + size_t count = pc()-base(); + unsigned insLength; + unsigned insSize; + char* curStream = (char*)base(); + while (count>0) { + ssize_t label = mLabelsInverseMapping.indexOfKey(curStream); + if (label >= 0) { + printf("%s:\n", mLabelsInverseMapping.valueAt(label)); + } + ssize_t comment = mComments.indexOfKey(curStream); + if (comment >= 0) { + printf("; %s\n", mComments.valueAt(comment)); + } + insLength = decodeThenPrint(curStream); + curStream = curStream + insLength; + count = count - insLength; + } +} + +void X86Assembler::comment(const char* string) +{ + mComments.add(mStream, string); +} + +void X86Assembler::label(const char* theLabel) +{ + mLabels.add(theLabel, 
mStream); + mLabelsInverseMapping.add(mStream, theLabel); +} + +//the conditional jump +void X86Assembler::JCC(Mnemonic cc, const char* label) { + switch (cc) { + case Mnemonic_JO: + encoder_imm(Mnemonic_JO, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNO: + encoder_imm(Mnemonic_JNO, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JB: + encoder_imm(Mnemonic_JB, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNB: + encoder_imm(Mnemonic_JNB, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JZ: + encoder_imm(Mnemonic_JZ, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNZ: + encoder_imm(Mnemonic_JNZ, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JBE: + encoder_imm(Mnemonic_JBE, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNBE: + encoder_imm(Mnemonic_JNBE, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JS: + encoder_imm(Mnemonic_JS, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNS: + encoder_imm(Mnemonic_JNS, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JP: + encoder_imm(Mnemonic_JP, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNP: + encoder_imm(Mnemonic_JNP, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JL: + encoder_imm(Mnemonic_JL, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNL: + encoder_imm(Mnemonic_JNL, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JLE: + encoder_imm(Mnemonic_JLE, OpndSize_32, 0/*imm*/, mStream); + break; + case Mnemonic_JNLE: + encoder_imm(Mnemonic_JNLE, OpndSize_32, 0/*imm*/, mStream); + break; + default : + printf("the condition is not supported.\n"); + return; + } + mStreamNext = mStream + encoder_get_inst_size(mStream); + //the offset is relative to the next instruction of the current PC + mBranchTargets.add(branch_target_t(label, mStream, mStreamNext)); + mStream = mStreamNext; +} + +void X86Assembler::JMP(const char* label) { + encoder_imm(Mnemonic_JMP, OpndSize_32, 
0/*imm*/, mStream); + mStreamNext = mStream + encoder_get_inst_size(mStream); + mBranchTargets.add(branch_target_t(label, mStream, mStreamNext)); + mStream = mStreamNext; +} + +void X86Assembler::prepare_esp(int old_offset) +{ + mStreamUpdate = mStream; + SUB_IMM_TO_REG(old_offset, ESP); +} + +void X86Assembler::update_esp(int new_offset) +{ + encoder_update_imm_rm(new_offset, mStreamUpdate); +} + +void X86Assembler::shrink_esp(int shrink_offset) +{ + ADD_IMM_TO_REG(shrink_offset, ESP); +} + +void X86Assembler::callee_work() +{ + //push EBX, ESI, EDI which need to be done in callee + /* + push %ebp + mov %esp,%ebp + push %ebx + push %esi + push %edi + */ + PUSH(EBP); + MOV_REG_TO_REG(ESP, EBP); + PUSH(EBX); + PUSH(ESI); + PUSH(EDI); +} + +void X86Assembler::return_work() +{ +// pop %esi +// pop %edi +// pop %ebx +// movl %ebp,%esp +// pop %ebp +// ret +// ret is equivalent to below +// pop %eax // the return address +// jmp *%eax + POP(EDI); + POP(ESI); + POP(EBX); + POP(EBP); + encoder_return(mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +int X86Assembler::generate(const char* name) +{ + // fixup all the branches + size_t count = mBranchTargets.size(); + while (count--) { + const branch_target_t& bt = mBranchTargets[count]; + char* target_pc = mLabels.valueFor(bt.label); + LOG_ALWAYS_FATAL_IF(!target_pc, + "error resolving branch targets, target_pc is null"); + //the offset is relative to the next instruction of the current PC + int32_t offset = int32_t(target_pc - bt.next_pc); + encoder_update_imm(offset, bt.pc); + } + + mAssembly->resize((int)(pc()-base())); + + // the instruction cache is flushed by CodeCache + const int64_t duration = ggl_system_time() - mDuration; + const char * const format = "generated %s (%d ins size) at [%p:%p] in %lld ns\n"; + ALOGI(format, name, int(pc()-base()), base(), pc(), duration); + +#if defined(WITH_LIB_HARDWARE) + if (__builtin_expect(mQemuTracing, 0)) { + int err = qemu_add_mapping(uintptr_t(base()), 
name); + mQemuTracing = (err >= 0); + } +#endif + + char value[PROPERTY_VALUE_MAX]; + property_get("debug.pf.disasm", value, "0"); + if (atoi(value) != 0) { + printf(format, name, int(pc()-base()), base(), pc(), duration); + disassemble(name); + } + + return NO_ERROR; +} + +char* X86Assembler::pcForLabel(const char* label) +{ + return mLabels.valueFor(label); +} + +// ---------------------------------------------------------------------------- + +void X86Assembler::PUSH(int reg) { + encoder_reg(Mnemonic_PUSH, OpndSize_32, reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::POP(int reg) { + encoder_reg(Mnemonic_POP, OpndSize_32, reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +//arithmetic +void X86Assembler::ADD_REG_TO_REG(int src, int dst) { + encoder_reg_reg(Mnemonic_ADD, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ADD_IMM_TO_REG(int imm, int dst) { + encoder_imm_reg(Mnemonic_ADD, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ADD_IMM_TO_MEM(int imm, int disp, int dst) { + encoder_imm_mem(Mnemonic_ADD, OpndSize_32, imm, disp, dst, 0/*isBasePhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ADD_MEM_TO_REG(int base_reg, int disp, int dst) { + encoder_mem_reg(Mnemonic_ADD, OpndSize_32, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ADD_REG_TO_MEM(int src, int base_reg, int disp) { + encoder_reg_mem(Mnemonic_ADD, OpndSize_32, src, 0/*isPhysical*/, disp, base_reg, 0/*isBasePhysical*/, LowOpndRegType_gp, mStream); + 
mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SUB_REG_TO_REG(int src, int dst) { + encoder_reg_reg(Mnemonic_SUB, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SUB_IMM_TO_REG(int imm, int dst) { + encoder_imm_reg(Mnemonic_SUB, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SUB_IMM_TO_MEM(int imm, int disp, int dst) { + encoder_imm_mem(Mnemonic_SUB, OpndSize_32, imm, disp, dst, 0/*isBasePhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SUB_REG_TO_MEM(int src, int base_reg, int disp) { + encoder_reg_mem(Mnemonic_SUB, OpndSize_32, src, 0/*isPhysical*/, disp, base_reg, 0/*isBasePhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +//test +void X86Assembler::TEST_REG_TO_REG(int src, int dst, OpndSize size) { + encoder_reg_reg(Mnemonic_TEST, size, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +//compare +void X86Assembler::CMP_REG_TO_REG(int src, int dst, OpndSize size) { + encoder_reg_reg(Mnemonic_CMP, size, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::CMP_IMM_TO_REG(int imm, int dst) { + encoder_imm_reg(Mnemonic_CMP, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::CMP_MEM_TO_REG(int base_reg, int disp, int dst, OpndSize size) { + encoder_mem_reg(Mnemonic_CMP, size, disp, base_reg, 0/*isBasePhysical*/, + dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream 
= mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::CMP_REG_TO_MEM(int reg, int disp, int base_reg, OpndSize size) +{ + encoder_reg_mem(Mnemonic_CMP, size, reg, 0/*isPhysical*/, disp, base_reg, 0/*isBasePhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +//logical +void X86Assembler::AND_REG_TO_REG(int src, int dst) { + encoder_reg_reg(Mnemonic_AND, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::AND_IMM_TO_REG(int imm, int dst) { + encoder_imm_reg(Mnemonic_AND, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::OR_REG_TO_REG(int src, int dst) { + encoder_reg_reg(Mnemonic_OR, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::XOR(int src, int dst) { + encoder_reg_reg(Mnemonic_XOR, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::OR_IMM_TO_REG(int imm, int dst) { + encoder_imm_reg(Mnemonic_OR, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::NOT(int dst) { + encoder_reg(Mnemonic_NOT, OpndSize_32, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::NEG(int dst) { + encoder_reg(Mnemonic_NEG, OpndSize_32, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} +//shift +void X86Assembler::SHL(int imm, int dst) { + encoder_imm_reg(Mnemonic_SHL, OpndSize_32, imm, dst, 
0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SHL(int imm, int disp, int dst) { + encoder_imm_mem(Mnemonic_SHL, OpndSize_32, imm, disp, dst, 0/*isBasePhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SHR(int imm, int dst) { + encoder_imm_reg(Mnemonic_SHR, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SHR(int imm, int disp, int dst) { + encoder_imm_mem(Mnemonic_SHR, OpndSize_32, imm, disp, dst, 0/*isBasePhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::SAR(int imm, int dst) { + encoder_imm_reg(Mnemonic_SAR, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ROR(const int imm, int dst) { + encoder_imm_reg(Mnemonic_ROR, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::ROR(int imm, int disp, int dst) { + encoder_imm_mem(Mnemonic_ROR, OpndSize_32, imm, disp, dst, 0/*isBasePhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} +//signed extension +void X86Assembler::MOVSX_MEM_TO_REG(OpndSize size, int base_reg, int disp, int dst) { + encoder_moves_mem_to_reg(size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOVSX_REG_TO_REG(OpndSize size, int src, int dst) { + encoder_moves_reg_to_reg(size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} +//zero entension +void X86Assembler::MOVZX_MEM_TO_REG(OpndSize size, int base_reg, int disp, int dst) { + encoder_movez_mem_to_reg(size, disp, base_reg, 
0/*isBasePhysical*/, dst, 0/*isPhysical*/, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOVZX_REG_TO_REG(OpndSize size, int src, int dst) { + encoder_movez_reg_to_reg(size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +// multiply... +// the first source operand is placed in EAX +void X86Assembler::IMUL(int reg) { + encoder_reg(Mnemonic_IMUL, OpndSize_32, reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::IMUL(int src, int dst) { + encoder_reg_reg(Mnemonic_IMUL, OpndSize_32, src, 0/*isPhysical*/, dst/*dst is the destination*/, 0/*isPhysical2*/,LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MUL(int reg) { + encoder_reg(Mnemonic_MUL, OpndSize_32, reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + + +// data transfer... 
+void X86Assembler::MOV_IMM_TO_REG(int32_t imm, int dst) { + encoder_imm_reg(Mnemonic_MOV, OpndSize_32, imm, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOV_REG_TO_REG(int src, int dst, OpndSize size) +{ + if(src == dst) return; + encoder_reg_reg(Mnemonic_MOV, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOV_REG_TO_MEM(int reg, int disp, int base_reg, OpndSize size) +{ + encoder_reg_mem(Mnemonic_MOV, size, reg, 0/*isPhysical*/, disp, base_reg, 0/*isBasePhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOV_MEM_TO_REG(int disp, int base_reg, int reg, OpndSize size) +{ + encoder_mem_reg(Mnemonic_MOV, size, disp, base_reg, 0/*isBasePhysical*/, + reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} + +void X86Assembler::MOV_MEM_SCALE_TO_REG(int base_reg, int index_reg, int scale, int reg, OpndSize size) +{ + encoder_mem_scale_reg(Mnemonic_MOV, size, base_reg, 0/*isBasePhysical*/, index_reg, 0/*isIndexPhysical*/, scale, reg, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + mStream = mStream + encoder_get_inst_size(mStream); +} +// the conditional move +void X86Assembler::CMOV_REG_TO_REG(Mnemonic cc, int src, int dst, OpndSize size) +{ + switch (cc) { + case Mnemonic_CMOVO: + encoder_reg_reg(Mnemonic_CMOVO, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNO: + encoder_reg_reg(Mnemonic_CMOVNO, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVB: + encoder_reg_reg(Mnemonic_CMOVB, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNB: + encoder_reg_reg(Mnemonic_CMOVNB, size, src, 
0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVZ: + encoder_reg_reg(Mnemonic_CMOVZ, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNZ: + encoder_reg_reg(Mnemonic_CMOVNZ, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVBE: + encoder_reg_reg(Mnemonic_CMOVBE, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNBE: + encoder_reg_reg(Mnemonic_CMOVNBE, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVS: + encoder_reg_reg(Mnemonic_CMOVS, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNS: + encoder_reg_reg(Mnemonic_CMOVNS, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVP: + encoder_reg_reg(Mnemonic_CMOVP, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNP: + encoder_reg_reg(Mnemonic_CMOVNP, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVL: + encoder_reg_reg(Mnemonic_CMOVL, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNL: + encoder_reg_reg(Mnemonic_CMOVNL, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVLE: + encoder_reg_reg(Mnemonic_CMOVLE, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNLE: + encoder_reg_reg(Mnemonic_CMOVNLE, size, src, 0/*isPhysical*/, dst, 0/*isPhysical2*/, LowOpndRegType_gp, mStream); + break; + default : + printf("the condition is not supported.\n"); + return; + } + mStream = mStream + encoder_get_inst_size(mStream); +} + +void 
X86Assembler::CMOV_MEM_TO_REG(Mnemonic cc, int disp, int base_reg, int dst, OpndSize size) +{ + switch (cc) { + case Mnemonic_CMOVO: + encoder_mem_reg(Mnemonic_CMOVO, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNO: + encoder_mem_reg(Mnemonic_CMOVNO, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVB: + encoder_mem_reg(Mnemonic_CMOVB, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNB: + encoder_mem_reg(Mnemonic_CMOVNB, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVZ: + encoder_mem_reg(Mnemonic_CMOVZ, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNZ: + encoder_mem_reg(Mnemonic_CMOVNZ, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVBE: + encoder_mem_reg(Mnemonic_CMOVBE, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNBE: + encoder_mem_reg(Mnemonic_CMOVNBE, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVS: + encoder_mem_reg(Mnemonic_CMOVS, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNS: + encoder_mem_reg(Mnemonic_CMOVNS, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVP: + encoder_mem_reg(Mnemonic_CMOVP, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNP: + encoder_mem_reg(Mnemonic_CMOVNP, size, disp, base_reg, 0/*isBasePhysical*/, dst, 
0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVL: + encoder_mem_reg(Mnemonic_CMOVL, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNL: + encoder_mem_reg(Mnemonic_CMOVNL, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVLE: + encoder_mem_reg(Mnemonic_CMOVLE, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + case Mnemonic_CMOVNLE: + encoder_mem_reg(Mnemonic_CMOVNLE, size, disp, base_reg, 0/*isBasePhysical*/, dst, 0/*isPhysical*/, LowOpndRegType_gp, mStream); + break; + default : + printf("the condition is not supported.\n"); + return; + } + mStream = mStream + encoder_get_inst_size(mStream); +} + +}; // namespace android diff --git a/libpixelflinger/codeflinger/x86/X86Assembler.h b/libpixelflinger/codeflinger/x86/X86Assembler.h new file mode 100644 index 0000000..03502d5 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/X86Assembler.h @@ -0,0 +1,163 @@ +/* libs/pixelflinger/codeflinger/x86/X86Assembler.h +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#ifndef ANDROID_X86ASSEMBLER_H +#define ANDROID_X86ASSEMBLER_H + +#include <stdint.h> +#include <sys/types.h> + +#include <utils/Vector.h> +#include <utils/KeyedVector.h> + +#include "codeflinger/tinyutils/smartpointer.h" +#include "codeflinger/CodeCache.h" +#include "enc_wrapper.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +class X86Assembler +{ +public: + + enum { + EAX = PhysicalReg_EAX, EBX = PhysicalReg_EBX, ECX = PhysicalReg_ECX, + EDX = PhysicalReg_EDX, EDI = PhysicalReg_EDI, ESI = PhysicalReg_ESI, + ESP = PhysicalReg_ESP, EBP = PhysicalReg_EBP + }; + + X86Assembler(const sp<Assembly>& assembly); + ~X86Assembler(); + + char* base() const; + char* pc() const; + + + void disassemble(const char* name); + + // ------------------------------------------------------------------------ + // X86AssemblerInterface... + // ------------------------------------------------------------------------ + + void reset(); + + int generate(const char* name); + + void comment(const char* string); + + void label(const char* theLabel); + + void JCC(Mnemonic cc, const char* label); + + void JMP(const char* label); + + void prepare_esp(int old_offset); + + void update_esp(int new_offset); + + void shrink_esp(int shrink_offset); + + void callee_work(); + + void return_work(); + + char* pcForLabel(const char* label); + + void PUSH(int reg); + + void POP(int reg); + + void ADD_REG_TO_REG(int src, int dst); + void ADD_IMM_TO_REG(int imm, int dst); + void ADD_IMM_TO_MEM(int imm, int disp, int dst); + void ADD_MEM_TO_REG(int base_reg, int disp, int dst); + void ADD_REG_TO_MEM(int src, int base_reg, int disp); + void SUB_REG_TO_REG(int src, int dst); + void SUB_IMM_TO_REG(int imm, int dst); + void SUB_IMM_TO_MEM(int imm, int disp, int dst); + void SUB_REG_TO_MEM(int src, int base_reg, int disp); + + void TEST_REG_TO_REG(int src, int dst, OpndSize size=OpndSize_32); + void CMP_REG_TO_REG(int src, int dst, OpndSize 
size=OpndSize_32); + void CMP_MEM_TO_REG(int base_reg, int disp, int dst, OpndSize size=OpndSize_32); + void CMP_REG_TO_MEM(int reg, int disp, int base_reg, OpndSize size=OpndSize_32); + void CMP_IMM_TO_REG(int imm, int dst); + + void AND_REG_TO_REG(int src, int dst); + void AND_IMM_TO_REG(int imm, int dst); + void OR_REG_TO_REG(int src, int dst); + void XOR(int src, int dst); + void OR_IMM_TO_REG(int imm, int dst); + void NOT(int dst); + void NEG(int dst); + void SHL(int imm, int dst); + void SHL(int imm, int disp, int dst); + void SHR(int imm, int dst); + void SHR(int imm, int disp, int dst); + void SAR(int imm, int dst); + void ROR(const int imm, int dst); + void ROR(int imm, int disp, int dst); + void IMUL(int reg); + void IMUL(int src, int dst); + void MUL(int reg); + + void MOVSX_MEM_TO_REG(OpndSize size, int base_reg, int disp, int dst); + void MOVSX_REG_TO_REG(OpndSize size, int src, int dst); + void MOVZX_MEM_TO_REG(OpndSize size, int base_reg, int disp, int dst); + void MOVZX_REG_TO_REG(OpndSize size, int src, int dst); + void MOV_IMM_TO_REG(int32_t imm, int dst); + void MOV_REG_TO_REG(int src, int dst, OpndSize size=OpndSize_32); + void MOV_MEM_TO_REG(int disp, int base_reg, int reg, OpndSize size=OpndSize_32); + void MOV_REG_TO_MEM(int reg, int disp, int base_reg, OpndSize size=OpndSize_32); + void MOV_MEM_SCALE_TO_REG(int base_reg, int index_reg, int scale, int reg, OpndSize size=OpndSize_32); + void CMOV_REG_TO_REG(Mnemonic cc, int src, int dst, OpndSize size=OpndSize_32); + void CMOV_MEM_TO_REG(Mnemonic cc, int disp, int base_reg, int dst, OpndSize size=OpndSize_32); + + + sp<Assembly> mAssembly; + char* mBase; + char* mStream; + //branch target offset is relative to the next instruction + char* mStreamNext; + //updating esp after iterating the loop + char* mStreamUpdate; + + int64_t mDuration; +#if defined(WITH_LIB_HARDWARE) + bool mQemuTracing; +#endif + + struct branch_target_t { + inline branch_target_t() : label(0), pc(0), next_pc(0) { } + 
inline branch_target_t(const char* l, char* p, char* next_p) + : label(l), pc(p), next_pc(next_p) { } + const char* label; + char* pc; + char* next_pc; + }; + + Vector<branch_target_t> mBranchTargets; + KeyedVector< const char*, char* > mLabels; + KeyedVector< char*, const char* > mLabelsInverseMapping; + KeyedVector< char*, const char* > mComments; +}; + +}; // namespace android + +#endif //ANDROID_X86ASSEMBLER_H diff --git a/libpixelflinger/codeflinger/x86/blending.cpp b/libpixelflinger/codeflinger/x86/blending.cpp new file mode 100644 index 0000000..f918ffd --- /dev/null +++ b/libpixelflinger/codeflinger/x86/blending.cpp @@ -0,0 +1,974 @@ +/* libs/pixelflinger/codeflinger/x86/blending.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+*/ + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> + +#include <cutils/log.h> + +#include "codeflinger/x86/GGLX86Assembler.h" + + +namespace android { + +void GGLX86Assembler::build_fog( + component_t& temp, // incomming fragment / output + int component, + Scratch& regs) +{ + if (mInfo[component].fog) { + Scratch scratches(registerFile()); + comment("fog"); + + temp.reg = scratches.obtain(); + MOV_MEM_TO_REG(temp.offset_ebp, EBP, temp.reg); + integer_t fragment(temp.reg, temp.h, temp.flags, temp.offset_ebp); + if (!(temp.flags & CORRUPTIBLE)) { + temp.reg = regs.obtain(); + temp.flags |= CORRUPTIBLE; + } + + integer_t fogColor(scratches.obtain(), 8, CORRUPTIBLE); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + MOVZX_MEM_TO_REG(OpndSize_8, mBuilderContext.Rctx, GGL_OFFSETOF(state.fog.color[component]), fogColor.reg); + + integer_t factor(scratches.obtain(), 16, CORRUPTIBLE); + CONTEXT_LOAD(factor.reg, generated_vars.f); + scratches.recycle(mBuilderContext.Rctx); + + // clamp fog factor (TODO: see if there is a way to guarantee + // we won't overflow, when setting the iterators) + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(factor.reg, temp_reg); + SAR(31, temp_reg); + NOT(temp_reg); + AND_REG_TO_REG(temp_reg, factor.reg); + MOV_IMM_TO_REG(0x10000, temp_reg); + CMP_IMM_TO_REG(0x10000, factor.reg); + CMOV_REG_TO_REG(Mnemonic_CMOVAE, temp_reg, factor.reg); + scratches.recycle(temp_reg); + + //we will resue factor.reg + build_blendFOneMinusF(temp, factor, fragment, fogColor); + MOV_REG_TO_MEM(temp.reg, temp.offset_ebp, EBP); + scratches.recycle(temp.reg); + } +} + +void GGLX86Assembler::build_blending( + component_t& temp, // incomming fragment / output + pixel_t& pixel, // framebuffer + int component, + Scratch& regs) +{ + if (!mInfo[component].blend) + return; + + int fs = component==GGLFormat::ALPHA ? 
mBlendSrcA : mBlendSrc; + int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst; + if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) + fs = GGL_ONE; + const int blending = blending_codes(fs, fd); + if (!temp.size()) { + // here, blending will produce something which doesn't depend on + // that component (eg: GL_ZERO:GL_*), so the register has not been + // allocated yet. Will never be used as a source. + //temp = component_t(regs.obtain(), CORRUPTIBLE, temp_offset_ebp); + temp.reg = regs.obtain(); + temp.flags = CORRUPTIBLE; + temp.h = temp.l = 0; + } else { + temp.reg = regs.obtain(); + } + MOV_MEM_TO_REG(temp.offset_ebp, EBP, temp.reg); + // we are doing real blending... + // fb: extracted dst + // fragment: extracted src + // temp: component_t(fragment) and result + + // scoped register allocator + Scratch scratches(registerFile()); + comment("blending"); + + // we can optimize these cases a bit... + // (1) saturation is not needed + // (2) we can use only one multiply instead of 2 + // (3) we can reduce the register pressure + // R = S*f + D*(1-f) = (S-D)*f + D + // R = S*(1-f) + D*f = (D-S)*f + S + + const bool same_factor_opt1 = + (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) || + (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) || + (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) || + (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA); + + const bool same_factor_opt2 = + (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) || + (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || + (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) || + (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA); + + + // XXX: we could also optimize these cases: + // R = S*f + D*f = (S+D)*f + // R = S*(1-f) + D*(1-f) = (S+D)*(1-f) + // R = S*D + D*S = 2*S*D + + + pixel.reg = scratches.obtain(); + MOV_MEM_TO_REG(pixel.offset_ebp, EBP, pixel.reg); + // see if we need to extract 'component' from the destination (fb) + integer_t fb; + if (blending & 
(BLEND_DST|FACTOR_DST)) { + fb.setTo(scratches.obtain(), 32); + extract(fb, pixel, component); + if (mDithering) { + // XXX: maybe what we should do instead, is simply + // expand fb -or- fragment to the larger of the two + if (fb.size() < temp.size()) { + // for now we expand 'fb' to min(fragment, 8) + int new_size = temp.size() < 8 ? temp.size() : 8; + expand(fb, fb, new_size); + } + } + } + + // convert input fragment to integer_t + if (temp.l && (temp.flags & CORRUPTIBLE)) { + SHR(temp.l, temp.reg); + temp.h -= temp.l; + temp.l = 0; + } + integer_t fragment(temp.reg, temp.size(), temp.flags, temp.offset_ebp); + + // if not done yet, convert input fragment to integer_t + if (temp.l) { + // here we know temp is not CORRUPTIBLE + fragment.reg = scratches.obtain(); + MOV_REG_TO_REG(temp.reg, fragment.reg); + SHR(temp.l, fragment.reg); + fragment.flags |= CORRUPTIBLE; + } + + if (!(temp.flags & CORRUPTIBLE)) { + // temp is not corruptible, but since it's the destination it + // will be modified, so we need to allocate a new register. 
+ temp.reg = regs.obtain(); + temp.flags &= ~CORRUPTIBLE; + fragment.flags &= ~CORRUPTIBLE; + } + + if ((blending & BLEND_SRC) && !same_factor_opt1) { + // source (fragment) is needed for the blending stage + // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1) + fragment.flags &= ~CORRUPTIBLE; + } + + + if (same_factor_opt1) { + // R = S*f + D*(1-f) = (S-D)*f + D + integer_t factor; + build_blend_factor(factor, fs, + component, pixel, fragment, fb, scratches); + // fb is always corruptible from this point + fb.flags |= CORRUPTIBLE; + //we will reuse factor in mul_factor_add of build_blendFOneMinusF, unless factor.reg == fragment.reg == temp.reg or factor.reg == fb.reg in build_blend_factor + if(factor.reg == fragment.reg || factor.reg == fb.reg) + MOV_REG_TO_REG(factor.reg, pixel.reg); + else + scratches.recycle(pixel.reg); + build_blendFOneMinusF(temp, factor, fragment, fb); + if(factor.reg == fragment.reg || factor.reg == fb.reg) { + MOV_REG_TO_REG(pixel.reg, factor.reg); + scratches.recycle(pixel.reg); + } + scratches.recycle(fb.reg); + //scratches.recycle(factor.reg); + } else if (same_factor_opt2) { + // R = S*(1-f) + D*f = (D-S)*f + S + integer_t factor; + // fb is always corrruptible here + fb.flags |= CORRUPTIBLE; + build_blend_factor(factor, fd, + component, pixel, fragment, fb, scratches); + //we will reuse factor in mul_factor_add of build_blendFOneMinusFF, unless factor.reg == fragment.reg == temp.reg or factor.reg == fb.reg in build_blend_factor + if(factor.reg == fragment.reg || factor.reg == fb.reg) + MOV_REG_TO_REG(factor.reg, pixel.reg); + else + scratches.recycle(pixel.reg); + build_blendOneMinusFF(temp, factor, fragment, fb); + if(factor.reg == fragment.reg || factor.reg == fb.reg) { + MOV_REG_TO_REG(pixel.reg, factor.reg); + scratches.recycle(pixel.reg); + } + scratches.recycle(fb.reg); + } else { + integer_t src_factor; + integer_t dst_factor; + + // if destination (fb) is not needed for the blending stage, + // then it can be 
marked as CORRUPTIBLE + if (!(blending & BLEND_DST)) { + fb.flags |= CORRUPTIBLE; + } + + // XXX: try to mark some registers as CORRUPTIBLE + // in most case we could make those corruptible + // when we're processing the last component + // but not always, for instance + // when fragment is constant and not reloaded + // when fb is needed for logic-ops or masking + // when a register is aliased (for instance with mAlphaSource) + + // blend away... + if (fs==GGL_ZERO) { + if (fd==GGL_ZERO) { // R = 0 + // already taken care of + } else if (fd==GGL_ONE) { // R = D + // already taken care of + } else { // R = D*fd + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + scratches.recycle(pixel.reg); + mul_factor(temp, fb, dst_factor, regs); + scratches.recycle(fb.reg); + } + } else if (fs==GGL_ONE) { + int temp_reg; + if (fd==GGL_ZERO) { // R = S + // NOP, taken care of + } else if (fd==GGL_ONE) { // R = S + D + component_add(temp, fb, fragment); // args order matters + temp_reg = scratches.obtain(); + component_sat(temp, temp_reg); + scratches.recycle(temp_reg); + } else { // R = S + D*fd + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + //we will probably change src_factor in mul_factor_add, unless factor.reg == fragment.reg == temp.reg or factor.reg == fb.reg in build_blend_factor + if(dst_factor.reg == fragment.reg || dst_factor.reg == fb.reg) + MOV_REG_TO_REG(dst_factor.reg, pixel.reg); + else + scratches.recycle(pixel.reg); + mul_factor_add(temp, fb, dst_factor, component_t(fragment)); + if(dst_factor.reg == fragment.reg || dst_factor.reg == fb.reg) { + MOV_REG_TO_REG(pixel.reg, dst_factor.reg); + scratches.recycle(pixel.reg); + } + temp_reg = fb.reg; + component_sat(temp, temp_reg); + scratches.recycle(fb.reg); + } + } else { + // compute fs + int temp_reg; + build_blend_factor(src_factor, fs, + component, pixel, fragment, fb, scratches); + if (fd==GGL_ZERO) { // R = 
S*fs + mul_factor(temp, fragment, src_factor, regs); + if (scratches.isUsed(src_factor.reg)) + scratches.recycle(src_factor.reg); + } else if (fd==GGL_ONE) { // R = S*fs + D + //we will probably change src_factor in mul_factor_add, unless factor.reg == fragment.reg == temp.reg or factor.reg == fb.reg in build_blend_factor + if(src_factor.reg == fragment.reg || src_factor.reg == fb.reg) + MOV_REG_TO_REG(src_factor.reg, pixel.reg); + else + scratches.recycle(pixel.reg); + mul_factor_add(temp, fragment, src_factor, component_t(fb)); + if(src_factor.reg == fragment.reg || src_factor.reg == fb.reg) { + MOV_REG_TO_REG(pixel.reg, src_factor.reg); + scratches.recycle(pixel.reg); + } + temp_reg = fb.reg; + component_sat(temp, temp_reg); + scratches.recycle(fb.reg); + } else { // R = S*fs + D*fd + mul_factor(temp, fragment, src_factor, regs); + if (scratches.isUsed(src_factor.reg)) + scratches.recycle(src_factor.reg); + // compute fd + build_blend_factor(dst_factor, fd, + component, pixel, fragment, fb, scratches); + //we will probably change dst_factor in mul_factor_add, unless factor.reg == fragment.reg == temp.reg or factor.reg == fb.reg + if(dst_factor.reg == fragment.reg || dst_factor.reg == fb.reg) + MOV_REG_TO_REG(dst_factor.reg, pixel.reg); + else + scratches.recycle(pixel.reg); + mul_factor_add(temp, fb, dst_factor, temp); + if(dst_factor.reg == fragment.reg || dst_factor.reg == fb.reg) { + MOV_REG_TO_REG(pixel.reg, dst_factor.reg); + scratches.recycle(pixel.reg); + } + if (!same_factor_opt1 && !same_factor_opt2) { + temp_reg = fb.reg; + component_sat(temp, temp_reg); + } + scratches.recycle(fb.reg); + } + if(scratches.isUsed(pixel.reg)) + scratches.recycle(pixel.reg); + } + } + // temp is modified, but it will be used immediately in downshift + //printf("temp.offset_ebp: %d \n", temp.offset_ebp); + //below will be triggered on CDK for surfaceflinger + if(temp.offset_ebp == mAlphaSource.offset_ebp) { + mCurSp = mCurSp - 4; + temp.offset_ebp = mCurSp; + } + // the r, 
g, b value must be stored, otherwise the color of globaltime is incorrect. + MOV_REG_TO_MEM(temp.reg, temp.offset_ebp, EBP); + regs.recycle(temp.reg); + + // now we can be corrupted (it's the dest) + temp.flags |= CORRUPTIBLE; +} + +void GGLX86Assembler::build_blend_factor( + integer_t& factor, int f, int component, + const pixel_t& dst_pixel, + integer_t& fragment, + integer_t& fb, + Scratch& scratches) +{ + integer_t src_alpha(fragment); + + // src_factor/dst_factor won't be used after blending, + // so it's fine to mark them as CORRUPTIBLE (if not aliased) + factor.flags |= CORRUPTIBLE; + int temp_reg; + switch(f) { + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) { + // we're processing alpha, so we already have + // src-alpha in fragment, and we need src-alpha just this time. + } else { + // alpha-src will be needed for other components + factor = mAlphaSource; + factor.flags &= ~CORRUPTIBLE; + factor.reg = scratches.obtain(); + //printf("mAlphaSource.offset_ebp: %d \n", mAlphaSource.offset_ebp); + //printf("fragment.offset_ebp: %d \n", fragment.offset_ebp); + //printf("factor.offset_ebp: %d \n", factor.offset_ebp); + MOV_MEM_TO_REG(mAlphaSource.offset_ebp, EBP, factor.reg); + if (!mBlendFactorCached || mBlendFactorCached==f) { + src_alpha = mAlphaSource; + // we already computed the blend factor before, nothing to do. + if (mBlendFactorCached) + return; + // this is the first time, make sure to compute the blend + // factor properly. + mBlendFactorCached = f; + break; + } else { + // we have a cached alpha blend factor, but we want another one, + // this should really not happen because by construction, + // we cannot have BOTH source and destination + // blend factors use ALPHA *and* ONE_MINUS_ALPHA (because + // the blending stage uses the f/(1-f) optimization + + // for completeness, we handle this case though. 
Since there + // are only 2 choices, this meens we want "the other one" + // (1-factor) + //factor = mAlphaSource; + //factor.flags &= ~CORRUPTIBLE; + NEG(factor.reg); + ADD_IMM_TO_REG((1<<factor.s), factor.reg); + MOV_REG_TO_MEM(factor.reg, factor.offset_ebp, EBP); + mBlendFactorCached = f; + return; + } + } + // fall-through... + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + case GGL_SRC_ALPHA_SATURATE: + // help us find out what register we can use for the blend-factor + // CORRUPTIBLE registers are chosen first, or a new one is allocated. + if (fragment.flags & CORRUPTIBLE) { + factor.setTo(fragment.reg, 32, CORRUPTIBLE, fragment.offset_ebp); + fragment.flags &= ~CORRUPTIBLE; + } else if (fb.flags & CORRUPTIBLE) { + factor.setTo(fb.reg, 32, CORRUPTIBLE, fb.offset_ebp); + fb.flags &= ~CORRUPTIBLE; + } else { + factor.setTo(scratches.obtain(), 32, CORRUPTIBLE); + mCurSp = mCurSp - 4; + factor.offset_ebp = mCurSp; + } + break; + } + + // XXX: doesn't work if size==1 + + switch(f) { + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + factor.s = fb.s; + MOV_REG_TO_REG(fb.reg, factor.reg); + SHR(fb.s-1, factor.reg); + ADD_REG_TO_REG(fb.reg, factor.reg); + break; + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + factor.s = fragment.s; + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(fragment.reg, temp_reg); + SHR(fragment.s-1, fragment.reg); + ADD_REG_TO_REG(temp_reg, fragment.reg); + scratches.recycle(temp_reg); + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + factor.s = src_alpha.s; + if (mBlendFactorCached == f) { + //src_alpha == factor == mAlphaSource, we need a temp reg + if(scratches.countFreeRegs()) { + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(factor.reg, temp_reg); + SHR(src_alpha.s-1, factor.reg); + ADD_REG_TO_REG(temp_reg, factor.reg); + scratches.recycle(temp_reg); + } + else { + SHR(src_alpha.s-1, 
factor.offset_ebp, EBP); + ADD_MEM_TO_REG(EBP, factor.offset_ebp, factor.reg); + } + } + else + { + MOV_REG_TO_REG(src_alpha.reg, factor.reg); + SHR(src_alpha.s-1, factor.reg); + ADD_REG_TO_REG(src_alpha.reg, factor.reg); + } + // we will store factor in the next switch for GGL_ONE_MINUS_SRC_ALPHA + if(f == GGL_SRC_ALPHA) + MOV_REG_TO_MEM(factor.reg, factor.offset_ebp, EBP); + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + // XXX: should be precomputed + extract(factor, dst_pixel, GGLFormat::ALPHA); + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(factor.reg, temp_reg); + SHR(factor.s-1, factor.reg); + ADD_REG_TO_REG(temp_reg, factor.reg); + scratches.recycle(temp_reg); + break; + case GGL_SRC_ALPHA_SATURATE: + // XXX: should be precomputed + // XXX: f = min(As, 1-Ad) + // btw, we're guaranteed that Ad's size is <= 8, because + // it's extracted from the framebuffer + break; + } + + switch(f) { + case GGL_ONE_MINUS_DST_COLOR: + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_ONE_MINUS_SRC_ALPHA: + NEG(factor.reg); + ADD_IMM_TO_REG(1<<factor.s, factor.reg); + MOV_REG_TO_MEM(factor.reg, factor.offset_ebp, EBP); + } + + // don't need more than 8-bits for the blend factor + // and this will prevent overflows in the multiplies later + if (factor.s > 8) { + SHR(factor.s-8, factor.reg); + factor.s = 8; + if(f == GGL_ONE_MINUS_SRC_ALPHA || f == GGL_SRC_ALPHA) + MOV_REG_TO_MEM(factor.reg, factor.offset_ebp, EBP); + } + //below will be triggered on CDK for surfaceflinger + if(fragment.offset_ebp == mAlphaSource.offset_ebp) + MOV_REG_TO_REG(factor.reg, fragment.reg); +} + +int GGLX86Assembler::blending_codes(int fs, int fd) +{ + int blending = 0; + switch(fs) { + case GGL_ONE: + blending |= BLEND_SRC; + break; + + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + blending |= FACTOR_DST|BLEND_SRC; + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + // no need to extract 'component' from the destination + // for the 
blend factor, because we need ALPHA only. + blending |= BLEND_SRC; + break; + + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + blending |= FACTOR_SRC|BLEND_SRC; + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + case GGL_SRC_ALPHA_SATURATE: + blending |= FACTOR_SRC|BLEND_SRC; + break; + } + switch(fd) { + case GGL_ONE: + blending |= BLEND_DST; + break; + + case GGL_ONE_MINUS_DST_COLOR: + case GGL_DST_COLOR: + blending |= FACTOR_DST|BLEND_DST; + break; + case GGL_ONE_MINUS_DST_ALPHA: + case GGL_DST_ALPHA: + blending |= FACTOR_DST|BLEND_DST; + break; + + case GGL_ONE_MINUS_SRC_COLOR: + case GGL_SRC_COLOR: + blending |= FACTOR_SRC|BLEND_DST; + break; + case GGL_ONE_MINUS_SRC_ALPHA: + case GGL_SRC_ALPHA: + // no need to extract 'component' from the source + // for the blend factor, because we need ALPHA only. + blending |= BLEND_DST; + break; + } + return blending; +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::build_blendFOneMinusF( + component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb) +{ + // R = S*f + D*(1-f) = (S-D)*f + D + // compute S-D + Scratch scratches(registerFile()); + integer_t diff(fragment.flags & CORRUPTIBLE ? 
+ fragment.reg : scratches.obtain(), fb.size(), CORRUPTIBLE); + const int shift = fragment.size() - fb.size(); + if (shift>0) { + MOV_REG_TO_REG(fragment.reg, diff.reg); + SHR(shift, diff.reg); + SUB_REG_TO_REG(fb.reg, diff.reg); + } else if (shift<0) { + MOV_REG_TO_REG(fragment.reg, diff.reg); + SHL(-shift, diff.reg); + SUB_REG_TO_REG(fb.reg, diff.reg); + } else { + MOV_REG_TO_REG(fragment.reg, diff.reg); + SUB_REG_TO_REG(fb.reg, diff.reg); + } + mul_factor_add(temp, diff, factor, component_t(fb)); + if(!(fragment.flags & CORRUPTIBLE)) + scratches.recycle(diff.reg); +} + +void GGLX86Assembler::build_blendOneMinusFF( + component_t& temp, + const integer_t& factor, + const integer_t& fragment, + const integer_t& fb) +{ + // R = S*f + D*(1-f) = (S-D)*f + D + Scratch scratches(registerFile()); + // compute D-S + integer_t diff(fb.flags & CORRUPTIBLE ? + fb.reg : scratches.obtain(), fb.size(), CORRUPTIBLE); + const int shift = fragment.size() - fb.size(); + if (shift>0) { + SHR(shift, fragment.reg); + MOV_REG_TO_REG(fb.reg, diff.reg); + SUB_REG_TO_REG(fragment.reg, diff.reg); + } + else if (shift<0) { + SHR(-shift, fragment.reg); + MOV_REG_TO_REG(fb.reg, diff.reg); + SUB_REG_TO_REG(fragment.reg, diff.reg); + } + else { + MOV_REG_TO_REG(fb.reg, diff.reg); + SUB_REG_TO_REG(fragment.reg, diff.reg); + } + + mul_factor_add(temp, diff, factor, component_t(fragment)); + if(!(fragment.flags & CORRUPTIBLE)) + scratches.recycle(diff.reg); +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::mul_factor( component_t& d, + const integer_t& v, + const integer_t& f, Scratch& scratches) +{ + // f can be changed + // + int vs = v.size(); + int fs = f.size(); + int ms = vs+fs; + + // XXX: we could have special cases for 1 bit mul + + // all this code below to use the best multiply instruction + // wrt the parameters size. 
We take advantage of the fact + // that the 16-bits multiplies allow a 16-bit shift + // The trick is that we just make sure that we have at least 8-bits + // per component (which is enough for a 8 bits display). + + int xy = -1; + int vshift = 0; + int fshift = 0; + int smulw = 0; + + int xyBB = 0; + int xyTB = 1; + int xyTT = 2; + int xyBT = 3; + if (vs<16) { + if (fs<16) { + xy = xyBB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + ms -= 16; + xy = xyTB; + } else { + // eg: 15 * 18 -> 15 * 15 + fshift = fs - 15; + ms -= fshift; + xy = xyBB; + } + } else if (GGL_BETWEEN(vs, 24, 31)) { + if (fs<16) { + ms -= 16; + xy = xyTB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + ms -= 32; + xy = xyTT; + } else { + // eg: 24 * 18 -> 8 * 18 + fshift = fs - 15; + ms -= 16 + fshift; + xy = xyTB; + } + } else { + if (fs<16) { + // eg: 18 * 15 -> 15 * 15 + vshift = vs - 15; + ms -= vshift; + xy = xyBB; + } else if (GGL_BETWEEN(fs, 24, 31)) { + // eg: 18 * 24 -> 15 * 8 + vshift = vs - 15; + ms -= 16 + vshift; + xy = xyBT; + } else { + // eg: 18 * 18 -> (15 * 18)>>16 + fshift = fs - 15; + ms -= 16 + fshift; + //xy = yB; //XXX SMULWB + smulw = 1; + } + } + + ALOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs); + + int vreg = v.reg; + int freg = f.reg; + if (vshift) { + MOV_REG_TO_REG(vreg, d.reg); + SHR(vshift, d.reg); + vreg = d.reg; + } + if (fshift) { + MOV_REG_TO_REG(vreg, d.reg); + SHR(fshift, d.reg); + freg = d.reg; + } + MOV_REG_TO_REG(vreg, d.reg); + if (smulw) { + int flag_push_edx = 0; + int flag_reserve_edx = 0; + int temp_reg2 = -1; + int edx_offset_ebp = 0; + if(scratches.isUsed(EDX) == 1) { + if(d.reg != EDX) { + flag_push_edx = 1; + mCurSp = mCurSp - 4; + edx_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EDX, edx_offset_ebp, EBP); + //PUSH(EDX); + } + } + else { + flag_reserve_edx = 1; + scratches.reserve(EDX); + } + if(scratches.isUsed(EAX)) { + if( freg == EAX || d.reg == EAX) { + MOVSX_REG_TO_REG(OpndSize_16, freg, freg); + if(freg == EAX) + IMUL(d.reg); + else + 
IMUL(freg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d.reg); + } + else { + int eax_offset_ebp = 0; + if(scratches.countFreeRegs() > 0) { + temp_reg2 = scratches.obtain(); + MOV_REG_TO_REG(EAX, temp_reg2); + } + else { + mCurSp = mCurSp - 4; + eax_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EAX, eax_offset_ebp, EBP); + //PUSH(EAX); + } + MOV_REG_TO_REG(freg, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(d.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d.reg); + if(temp_reg2 > -1) { + MOV_REG_TO_REG(temp_reg2, EAX); + scratches.recycle(temp_reg2); + } + else { + MOV_MEM_TO_REG(eax_offset_ebp, EBP, EAX); + //POP(EAX); + } + } + } + else { + MOV_REG_TO_REG(freg, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(d.reg); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d.reg); + } + if(flag_push_edx == 1) { + MOV_MEM_TO_REG(edx_offset_ebp, EBP, EDX); + //POP(EDX); + } + if(flag_reserve_edx ==1) + scratches.recycle(EDX); + } + else { + if(xy == xyBB) { + MOVSX_REG_TO_REG(OpndSize_16, d.reg, d.reg); + MOVSX_REG_TO_REG(OpndSize_16, freg, freg); + IMUL(freg, d.reg); + } + else if(xy == xyTB) { + SHR(16, d.reg); + MOVSX_REG_TO_REG(OpndSize_16, d.reg, d.reg); + MOVSX_REG_TO_REG(OpndSize_16, freg, freg); + IMUL(freg, d.reg); + } + else if(xy == xyBT) { + MOVSX_REG_TO_REG(OpndSize_16, d.reg, d.reg); + SHR(16, freg); + MOVSX_REG_TO_REG(OpndSize_16, freg, freg); + IMUL(freg, d.reg); + } + else if(xy == xyTT) { + SHR(16, d.reg); + MOVSX_REG_TO_REG(OpndSize_16, d.reg, d.reg); + SHR(16, freg); + MOVSX_REG_TO_REG(OpndSize_16, freg, freg); + IMUL(freg, d.reg); + } + } + + + d.h = ms; + if (mDithering) { + d.l = 0; + } else { + d.l = fs; + d.flags |= CLEAR_LO; + } +} + +void GGLX86Assembler::mul_factor_add( component_t& d, + const integer_t& v, + const integer_t& f, + const component_t& a) +{ + // XXX: we could have special 
cases for 1 bit mul + Scratch scratches(registerFile()); + + int vs = v.size(); + int fs = f.size(); + int as = a.h; + int ms = vs+fs; + + ALOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as); + + integer_t add(a.reg, a.h, a.flags, a.offset_ebp); + + + // 'a' is a component_t but it is guaranteed to have + // its high bits set to 0. However in the dithering case, + // we can't get away with truncating the potentially bad bits + // so extraction is needed. + + if ((mDithering) && (a.size() < ms)) { + // we need to expand a + if (!(a.flags & CORRUPTIBLE)) { + // ... but it's not corruptible, so we need to pick a + // temporary register. + // Try to uses the destination register first (it's likely + // to be usable, unless it aliases an input). + if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) { + add.reg = d.reg; + } else { + add.reg = scratches.obtain(); + } + } + expand(add, a, ms); // extracts and expands + as = ms; + } + + if (ms == as) { + MOV_REG_TO_REG(v.reg, d.reg); + if (vs<16 && fs<16) { + MOVSX_REG_TO_REG(OpndSize_16, d.reg, d.reg); + MOVSX_REG_TO_REG(OpndSize_16, f.reg, f.reg); + IMUL(f.reg, d.reg); + } + else + IMUL(f.reg, d.reg); + ADD_REG_TO_REG(add.reg, d.reg); + } else { + //int temp = d.reg; + //if (temp == add.reg) { + // // the mul will modify add.reg, we need an intermediary reg + // if (v.flags & CORRUPTIBLE) temp = v.reg; + // else if (f.flags & CORRUPTIBLE) temp = f.reg; + // else temp = scratches.obtain(); + //} + + // below d.reg may override "temp" result, so we use a new register + int temp_reg; + int v_offset_ebp = 0; + if(scratches.countFreeRegs() == 0) { + temp_reg = v.reg; + mCurSp = mCurSp - 4; + v_offset_ebp = mCurSp; + MOV_REG_TO_MEM(v.reg, v_offset_ebp, EBP); + } + else { + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(v.reg, temp_reg); + } + if (vs<16 && fs<16) { + MOVSX_REG_TO_REG(OpndSize_16, temp_reg, temp_reg); + MOVSX_REG_TO_REG(OpndSize_16, f.reg, f.reg); + IMUL(f.reg, temp_reg); + } + else + 
IMUL(f.reg, temp_reg); + + if (ms>as) { + MOV_REG_TO_REG(add.reg, d.reg); + SHL(ms-as, d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + } else if (ms<as) { + // not sure if we should expand the mul instead? + MOV_REG_TO_REG(add.reg, d.reg); + SHL(as-ms, d.reg); + ADD_REG_TO_REG(temp_reg, d.reg); + } + if(temp_reg == v.reg) + MOV_MEM_TO_REG(v_offset_ebp, EBP, v.reg); + else + scratches.recycle(temp_reg); + } + + d.h = ms; + if (mDithering) { + d.l = a.l; + } else { + d.l = fs>a.l ? fs : a.l; + d.flags |= CLEAR_LO; + } +} + +void GGLX86Assembler::component_add(component_t& d, + const integer_t& dst, const integer_t& src) +{ + // here we're guaranteed that fragment.size() >= fb.size() + const int shift = src.size() - dst.size(); + if (!shift) { + MOV_REG_TO_REG(src.reg, d.reg); + ADD_REG_TO_REG(dst.reg, d.reg); + } else { + MOV_REG_TO_REG(dst.reg, d.reg); + SHL(shift, d.reg); + ADD_REG_TO_REG(src.reg, d.reg); + } + + d.h = src.size(); + if (mDithering) { + d.l = 0; + } else { + d.l = shift; + d.flags |= CLEAR_LO; + } +} + +void GGLX86Assembler::component_sat(const component_t& v, const int temp_reg) +{ + const int32_t one = ((1<<v.size())-1)<<v.l; + MOV_IMM_TO_REG(one, temp_reg); + CMP_IMM_TO_REG(1<<v.h, v.reg); + CMOV_REG_TO_REG(Mnemonic_CMOVAE, temp_reg, v.reg); +} + +// ---------------------------------------------------------------------------- + +}; // namespace android diff --git a/libpixelflinger/codeflinger/x86/libenc/Android.mk b/libpixelflinger/codeflinger/x86/libenc/Android.mk new file mode 100644 index 0000000..445de06 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/Android.mk @@ -0,0 +1,30 @@ +# +# Copyright (C) 2015 The Android-x86 Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +LOCAL_PATH := $(call my-dir) + +enc_src_files := \ + dec_base.cpp \ + enc_base.cpp \ + enc_tabl.cpp \ + enc_wrapper.cpp + +include $(CLEAR_VARS) +LOCAL_SRC_FILES := $(enc_src_files) +LOCAL_MODULE := libenc +LOCAL_MODULE_TAGS := optional +LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH) +include $(BUILD_STATIC_LIBRARY) diff --git a/libpixelflinger/codeflinger/x86/libenc/README.txt b/libpixelflinger/codeflinger/x86/libenc/README.txt new file mode 100644 index 0000000..a2e73ec --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/README.txt @@ -0,0 +1,21 @@ +Original source from Apache Harmony 5.0M15 (r991518 from 2010-09-01) at +http://harmony.apache.org/. + +The following files are from drlvm/vm/port/src/encoder/ia32_em64t. + + dec_base.cpp + dec_base.h + enc_base.cpp + enc_base.h + enc_defs.h + enc_prvt.h + enc_tabl.cpp + encoder.cpp + encoder.h + encoder.inl + +The following files are derived partially from the original Apache +Harmony files. + + enc_defs_ext.h -- derived from enc_defs.h + enc_wrapper.h -- derived from encoder.h diff --git a/libpixelflinger/codeflinger/x86/libenc/dec_base.cpp b/libpixelflinger/codeflinger/x86/libenc/dec_base.cpp new file mode 100644 index 0000000..ea85d10 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/dec_base.cpp @@ -0,0 +1,541 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ + +/** + * @file + * @brief Main decoding (disassembling) routines implementation. + */ + +#include "dec_base.h" +#include "enc_prvt.h" +#include <stdio.h> +//#include "open/common.h" + +bool DecoderBase::is_prefix(const unsigned char * bytes) +{ + unsigned char b0 = *bytes; + unsigned char b1 = *(bytes+1); + if (b0 == 0xF0) { // LOCK + return true; + } + if (b0==0xF2 || b0==0xF3) { // REPNZ/REPZ prefixes + if (b1 == 0x0F) { // .... but may be a part of SIMD opcode + return false; + } + return true; + } + if (b0 == 0x2E || b0 == 0x36 || b0==0x3E || b0==0x26 || b0==0x64 || b0==0x3E) { + // branch hints, segment prefixes + return true; + } + if (b0==0x66) { // operand-size prefix + if (b1 == 0x0F) { // .... 
but may be a part of SIMD opcode + return false; + } + return false; //XXX - currently considered as part of opcode//true; + } + if (b0==0x67) { // address size prefix + return true; + } + return false; +} + +// Returns prefix count from 0 to 4, or ((unsigned int)-1) on error +unsigned int DecoderBase::fill_prefs(const unsigned char * bytes, Inst * pinst) +{ + const unsigned char * my_bytes = bytes; + + while( 1 ) + { + unsigned char by1 = *my_bytes; + unsigned char by2 = *(my_bytes + 1); + Inst::PrefGroups where; + + switch( by1 ) + { + case InstPrefix_REPNE: + case InstPrefix_REP: + { + if( 0x0F == by2) + { + return pinst->prefc; + } + } + case InstPrefix_LOCK: + { + where = Inst::Group1; + break; + } + case InstPrefix_CS: + case InstPrefix_SS: + case InstPrefix_DS: + case InstPrefix_ES: + case InstPrefix_FS: + case InstPrefix_GS: +// case InstPrefix_HintTaken: the same as CS override +// case InstPrefix_HintNotTaken: the same as DS override + { + where = Inst::Group2; + break; + } + case InstPrefix_OpndSize: + { +//NOTE: prefix does not work for JMP Sz16, the opcode is 0x66 0xe9 +// here 0x66 will be treated as prefix, try_mn will try to match the code starting at 0xe9 +// it will match JMP Sz32 ... +//HACK: assume it is the last prefix, return any way + if( 0x0F == by2) + { + return pinst->prefc; + } + return pinst->prefc; + where = Inst::Group3; + break; + } + case InstPrefix_AddrSize: + { + where = Inst::Group4; + break; + } + default: + { + return pinst->prefc; + } + } + // Assertions are not allowed here. 
+ // Error situations should result in returning error status + if (InstPrefix_Null != pinst->pref[where]) //only one prefix in each group + return (unsigned int)-1; + + pinst->pref[where] = (InstPrefix)by1; + + if (pinst->prefc >= 4) //no more than 4 prefixes + return (unsigned int)-1; + + pinst->prefc++; + ++my_bytes; + } +} + + + +unsigned DecoderBase::decode(const void * addr, Inst * pinst) +{ + Inst tmp; + + //assert( *(unsigned char*)addr != 0x66); + + const unsigned char * bytes = (unsigned char*)addr; + + // Load up to 4 prefixes + // for each Mnemonic + unsigned int pref_count = fill_prefs(bytes, &tmp); + + if (pref_count == (unsigned int)-1) // Wrong prefix sequence, or >4 prefixes + return 0; // Error + + bytes += pref_count; + + // for each opcodedesc + // if (raw_len == 0) memcmp(, raw_len) + // else check the mixed state which is one of the following: + // /digit /i /rw /rd /rb + + bool found = false; + const unsigned char * saveBytes = bytes; + for (unsigned mn=1; mn<Mnemonic_Count; mn++) { + bytes = saveBytes; + found=try_mn((Mnemonic)mn, &bytes, &tmp); + if (found) { + tmp.mn = (Mnemonic)mn; + break; + } + } + if (!found) { + // Unknown opcode + return 0; + } + tmp.size = (unsigned)(bytes-(const unsigned char*)addr); + if (pinst) { + *pinst = tmp; + } + return tmp.size; +} + +#ifdef _EM64T_ +#define EXTEND_REG(reg, flag) \ + ((NULL == rex || 0 == rex->flag) ? 
reg : (reg + 8)) +#else +#define EXTEND_REG(reg, flag) (reg) +#endif + +//don't know the use of rex, seems not used when _EM64T_ is not enabled +bool DecoderBase::decode_aux(const EncoderBase::OpcodeDesc& odesc, unsigned aux, + const unsigned char ** pbuf, Inst * pinst +#ifdef _EM64T_ + , const Rex UNREF *rex +#endif + ) +{ + OpcodeByteKind kind = (OpcodeByteKind)(aux & OpcodeByteKind_KindMask); + unsigned byte = (aux & OpcodeByteKind_OpcodeMask); + unsigned data_byte = **pbuf; + EncoderBase::Operand& opnd = pinst->operands[pinst->argc]; + const EncoderBase::OpndDesc& opndDesc = odesc.opnds[pinst->argc]; + + switch (kind) { + case OpcodeByteKind_SlashR: + { + RegName reg; + OpndKind okind; + const ModRM& modrm = *(ModRM*)*pbuf; + if (opndDesc.kind & OpndKind_Mem) { // 1st operand is memory +#ifdef _EM64T_ + decodeModRM(odesc, pbuf, pinst, rex); +#else + decodeModRM(odesc, pbuf, pinst); +#endif + ++pinst->argc; + const EncoderBase::OpndDesc& opndDesc2 = odesc.opnds[pinst->argc]; + okind = ((opndDesc2.kind & OpndKind_XMMReg) || opndDesc2.size==OpndSize_64) ? OpndKind_XMMReg : OpndKind_GPReg; + EncoderBase::Operand& regOpnd = pinst->operands[pinst->argc]; + reg = getRegName(okind, opndDesc2.size, EXTEND_REG(modrm.reg, r)); + regOpnd = EncoderBase::Operand(reg); + } else { // 2nd operand is memory + okind = ((opndDesc.kind & OpndKind_XMMReg) || opndDesc.size==OpndSize_64) ? 
OpndKind_XMMReg : OpndKind_GPReg; + EncoderBase::Operand& regOpnd = pinst->operands[pinst->argc]; + reg = getRegName(okind, opndDesc.size, EXTEND_REG(modrm.reg, r)); + regOpnd = EncoderBase::Operand(reg); + ++pinst->argc; +#ifdef _EM64T_ + decodeModRM(odesc, pbuf, pinst, rex); +#else + decodeModRM(odesc, pbuf, pinst); +#endif + } + ++pinst->argc; + } + return true; + case OpcodeByteKind_rb: + case OpcodeByteKind_rw: + case OpcodeByteKind_rd: + { + // Gregory - + // Here we don't parse register because for current needs + // disassembler doesn't require to parse all operands + unsigned regid = data_byte - byte; + if (regid>7) { + return false; + } + OpndSize opnd_size; + switch(kind) + { + case OpcodeByteKind_rb: + { + opnd_size = OpndSize_8; + break; + } + case OpcodeByteKind_rw: + { + opnd_size = OpndSize_16; + break; + } + case OpcodeByteKind_rd: + { + opnd_size = OpndSize_32; + break; + } + default: + opnd_size = OpndSize_32; // so there is no compiler warning + assert( false ); + } + opnd = EncoderBase::Operand( getRegName(OpndKind_GPReg, opnd_size, regid) ); + + ++pinst->argc; + ++*pbuf; + return true; + } + case OpcodeByteKind_cb: + { + char offset = *(char*)*pbuf; + *pbuf += 1; + opnd = EncoderBase::Operand(offset); + ++pinst->argc; + //pinst->direct_addr = (void*)(pinst->offset + *pbuf); + } + return true; + case OpcodeByteKind_cw: + // not an error, but not expected in current env + // Android x86 + { + short offset = *(short*)*pbuf; + *pbuf += 2; + opnd = EncoderBase::Operand(offset); + ++pinst->argc; + } + return true; + //return false; + case OpcodeByteKind_cd: + { + int offset = *(int*)*pbuf; + *pbuf += 4; + opnd = EncoderBase::Operand(offset); + ++pinst->argc; + } + return true; + case OpcodeByteKind_SlashNum: + { + const ModRM& modrm = *(ModRM*)*pbuf; + if (modrm.reg != byte) { + return false; + } + decodeModRM(odesc, pbuf, pinst +#ifdef _EM64T_ + , rex +#endif + ); + ++pinst->argc; + } + return true; + case OpcodeByteKind_ib: + { + char ival = 
*(char*)*pbuf; + opnd = EncoderBase::Operand(ival); + ++pinst->argc; + *pbuf += 1; + } + return true; + case OpcodeByteKind_iw: + { + short ival = *(short*)*pbuf; + opnd = EncoderBase::Operand(ival); + ++pinst->argc; + *pbuf += 2; + } + return true; + case OpcodeByteKind_id: + { + int ival = *(int*)*pbuf; + opnd = EncoderBase::Operand(ival); + ++pinst->argc; + *pbuf += 4; + } + return true; +#ifdef _EM64T_ + case OpcodeByteKind_io: + { + long long int ival = *(long long int*)*pbuf; + opnd = EncoderBase::Operand(OpndSize_64, ival); + ++pinst->argc; + *pbuf += 8; + } + return true; +#endif + case OpcodeByteKind_plus_i: + { + unsigned regid = data_byte - byte; + if (regid>7) { + return false; + } + ++*pbuf; + return true; + } + case OpcodeByteKind_ZeroOpcodeByte: // cant be here + return false; + default: + // unknown kind ? how comes ? + break; + } + return false; +} + +bool DecoderBase::try_mn(Mnemonic mn, const unsigned char ** pbuf, Inst * pinst) { + const unsigned char * save_pbuf = *pbuf; + EncoderBase::OpcodeDesc * opcodes = EncoderBase::opcodes[mn]; + + for (unsigned i=0; !opcodes[i].last; i++) { + const EncoderBase::OpcodeDesc& odesc = opcodes[i]; + char *opcode_ptr = const_cast<char *>(odesc.opcode); + int opcode_len = odesc.opcode_len; +#ifdef _EM64T_ + Rex *prex = NULL; + Rex rex; +#endif + + *pbuf = save_pbuf; +#ifdef _EM64T_ + // Match REX prefixes + unsigned char rex_byte = (*pbuf)[0]; + if ((rex_byte & 0xf0) == 0x40) + { + if ((rex_byte & 0x08) != 0) + { + // Have REX.W + if (opcode_len > 0 && opcode_ptr[0] == 0x48) + { + // Have REX.W in opcode. All mnemonics that allow + // REX.W have to have specified it in opcode, + // otherwise it is not allowed + rex = *(Rex *)*pbuf; + prex = &rex; + (*pbuf)++; + opcode_ptr++; + opcode_len--; + } + } + else + { + // No REX.W, so it doesn't have to be in opcode. 
We + // have REX.B, REX.X, REX.R or their combination, but + // not in opcode, they may extend any part of the + // instruction + rex = *(Rex *)*pbuf; + prex = &rex; + (*pbuf)++; + } + } +#endif + if (opcode_len != 0) { + if (memcmp(*pbuf, opcode_ptr, opcode_len)) { + continue; + } + *pbuf += opcode_len; + } + if (odesc.aux0 != 0) { + + if (!decode_aux(odesc, odesc.aux0, pbuf, pinst +#ifdef _EM64T_ + , prex +#endif + )) { + continue; + } + if (odesc.aux1 != 0) { + if (!decode_aux(odesc, odesc.aux1, pbuf, pinst +#ifdef _EM64T_ + , prex +#endif + )) { + continue; + } + } + pinst->odesc = &opcodes[i]; + return true; + } + else { + // Can't have empty opcode + assert(opcode_len != 0); + pinst->odesc = &opcodes[i]; + return true; + } + } + return false; +} + +bool DecoderBase::decodeModRM(const EncoderBase::OpcodeDesc& odesc, + const unsigned char ** pbuf, Inst * pinst +#ifdef _EM64T_ + , const Rex *rex +#endif + ) +{ + EncoderBase::Operand& opnd = pinst->operands[pinst->argc]; + const EncoderBase::OpndDesc& opndDesc = odesc.opnds[pinst->argc]; + + //XXX debug ///assert(0x66 != *(*pbuf-2)); + const ModRM& modrm = *(ModRM*)*pbuf; + *pbuf += 1; + + RegName base = RegName_Null; + RegName index = RegName_Null; + int disp = 0; + unsigned scale = 0; + + // On x86_64 all mnemonics that allow REX.W have REX.W in opcode. + // Therefore REX.W is simply ignored, and opndDesc.size is used + + if (modrm.mod == 3) { + // we have only modrm. no sib, no disp. + // Android x86: Use XMMReg for 64b operand. + OpndKind okind = ((opndDesc.kind & OpndKind_XMMReg) || opndDesc.size == OpndSize_64) ? OpndKind_XMMReg : OpndKind_GPReg; + RegName reg = getRegName(okind, opndDesc.size, EXTEND_REG(modrm.rm, b)); + opnd = EncoderBase::Operand(reg); + return true; + } + //Android x86: m16, m32, m64: mean a byte[word|doubleword] operand in memory + //base and index should be 32 bits!!! 
+ const SIB& sib = *(SIB*)*pbuf; + // check whether we have a sib + if (modrm.rm == 4) { + // yes, we have SIB + *pbuf += 1; + if (sib.index != 4) { + index = getRegName(OpndKind_GPReg, OpndSize_32, EXTEND_REG(sib.index, x)); //Android x86: OpndDesc.size + } else { + // (sib.index == 4) => no index + //%esp can't be sib.index + } + + // scale = sib.scale == 0 ? 0 : (1<<sib.scale); + // scale = (1<<sib.scale); + scale = (index == RegName_Null) ? 0 : (1<<sib.scale); + + if (sib.base != 5 || modrm.mod != 0) { + base = getRegName(OpndKind_GPReg, OpndSize_32, EXTEND_REG(sib.base, b)); //Android x86: OpndDesc.size + } else { + // (sib.base == 5 && modrm.mod == 0) => no base + } + } + else { + if (modrm.mod != 0 || modrm.rm != 5) { + base = getRegName(OpndKind_GPReg, OpndSize_32, EXTEND_REG(modrm.rm, b)); //Android x86: OpndDesc.size + } + else { + // mod=0 && rm == 5 => only disp32 + } + } + + //update disp and pbuf + if (modrm.mod == 2) { + // have disp32 + disp = *(int*)*pbuf; + *pbuf += 4; + } + else if (modrm.mod == 1) { + // have disp8 + disp = *(char*)*pbuf; + *pbuf += 1; + } + else { + assert(modrm.mod == 0); + if (modrm.rm == 5) { + // have disp32 w/o sib + disp = *(int*)*pbuf; + *pbuf += 4; + } + else if (modrm.rm == 4 && sib.base == 5) { + // have disp32 with SI in sib + disp = *(int*)*pbuf; + *pbuf += 4; + } + } + opnd = EncoderBase::Operand(opndDesc.size, base, index, scale, disp); + return true; +} diff --git a/libpixelflinger/codeflinger/x86/libenc/dec_base.h b/libpixelflinger/codeflinger/x86/libenc/dec_base.h new file mode 100644 index 0000000..f1fa123 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/dec_base.h @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ + +/** + * @file + * @brief Main decoding (disassembling) routines and structures. + * + * @note Quick and rough implementation, subject for a change. + */ + +#ifndef __DEC_BASE_H_INCLUDED__ +#define __DEC_BASE_H_INCLUDED__ + + +#include "enc_base.h" +#include "enc_prvt.h" + +#ifdef ENCODER_ISOLATE +using namespace enc_ia32; +#endif + +#define IF_CONDITIONAL (0x00000000) +#define IF_SYMMETRIC (0x00000000) +#define IF_BRANCH (0x00000000) + +struct Inst { + Inst() { + mn = Mnemonic_Null; + prefc = 0; + size = 0; + flags = 0; + //offset = 0; + //direct_addr = NULL; + argc = 0; + for(int i = 0; i < 4; ++i) + { + pref[i] = InstPrefix_Null; + } + } + /** + * Mnemonic of the instruction.s + */ + Mnemonic mn; + /** + * Enumerating of indexes in the pref array. + */ + enum PrefGroups + { + Group1 = 0, + Group2, + Group3, + Group4 + }; + /** + * Number of prefixes (1 byte each). + */ + unsigned int prefc; + /** + * Instruction prefixes. Prefix should be placed here according to its group. + */ + InstPrefix pref[4]; + /** + * Size, in bytes, of the instruction. + */ + unsigned size; + /** + * Flags of the instruction. + * @see MF_ + */ + unsigned flags; + /** + * An offset of target address, in case of 'CALL offset', + * 'JMP/Jcc offset'. 
+ */ + //int offset; + /** + * Direct address of the target (on Intel64/IA-32 is 'instruction IP' + + * 'instruction length' + offset). + */ + //void * direct_addr; + /** + * Number of arguments of the instruction. + */ + unsigned argc; + // + EncoderBase::Operand operands[3]; + // + const EncoderBase::OpcodeDesc * odesc; +}; + +inline bool is_jcc(Mnemonic mn) +{ + return Mnemonic_JO <= mn && mn<=Mnemonic_JG; +} + +class DecoderBase { +public: + static unsigned decode(const void * addr, Inst * pinst); +private: + static bool decodeModRM(const EncoderBase::OpcodeDesc& odesc, + const unsigned char ** pbuf, Inst * pinst +#ifdef _EM64T_ + , const Rex *rex +#endif + ); + static bool decode_aux(const EncoderBase::OpcodeDesc& odesc, + unsigned aux, const unsigned char ** pbuf, + Inst * pinst +#ifdef _EM64T_ + , const Rex *rex +#endif + ); + static bool try_mn(Mnemonic mn, const unsigned char ** pbuf, Inst * pinst); + static unsigned int fill_prefs( const unsigned char * bytes, Inst * pinst); + static bool is_prefix(const unsigned char * bytes); +}; + +#endif // ~ __DEC_BASE_H_INCLUDED__ diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_base.cpp b/libpixelflinger/codeflinger/x86/libenc/enc_base.cpp new file mode 100644 index 0000000..0562ce8 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_base.cpp @@ -0,0 +1,1137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ +#include "enc_base.h" +//#include <climits> +#include <string.h> +#define USE_ENCODER_DEFINES +#include "enc_prvt.h" +#include <stdio.h> + +//#define JET_PROTO + +#ifdef JET_PROTO +#include "dec_base.h" +#include "jvmti_dasm.h" +#endif + +ENCODER_NAMESPACE_START + +/** + * @file + * @brief Main encoding routines and structures. + */ + +#ifndef _WIN32 + #define strcmpi strcasecmp +#endif + +int EncoderBase::dummy = EncoderBase::buildTable(); + +const unsigned char EncoderBase::size_hash[OpndSize_64+1] = { + // + 0xFF, // OpndSize_Null = 0, + 3, // OpndSize_8 = 0x1, + 2, // OpndSize_16 = 0x2, + 0xFF, // 0x3 + 1, // OpndSize_32 = 0x4, + 0xFF, // 0x5 + 0xFF, // 0x6 + 0xFF, // 0x7 + 0, // OpndSize_64 = 0x8, + // +}; + +const unsigned char EncoderBase::kind_hash[OpndKind_Mem+1] = { + // + //gp reg -> 000 = 0 + //memory -> 001 = 1 + //immediate -> 010 = 2 + //xmm reg -> 011 = 3 + //segment regs -> 100 = 4 + //fp reg -> 101 = 5 + //mmx reg -> 110 = 6 + // + 0xFF, // 0 OpndKind_Null=0, + 0<<2, // 1 OpndKind_GPReg = + // OpndKind_MinRegKind=0x1, + 4<<2, // 2 OpndKind_SReg=0x2, + +#ifdef _HAVE_MMX_ + 6<<2, // 3 +#else + 0xFF, // 3 +#endif + + 5<<2, // 4 OpndKind_FPReg=0x4, + 0xFF, 0xFF, 0xFF, // 5, 6, 7 + 3<<2, // OpndKind_XMMReg=0x8, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 9, 0xA, 0xB, 0xC, 0xD, + // 0xE, 0xF + 0xFF, // OpndKind_MaxRegKind = + // OpndKind_StatusReg = + // OpndKind_OtherReg=0x10, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x11-0x18 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 
0x19-0x1F + 2<<2, // OpndKind_Immediate=0x20, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x21-0x28 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x29-0x30 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x31-0x38 + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // 0x39-0x3F + 1<<2, // OpndKind_Memory=0x40 +}; + +char * EncoderBase::curRelOpnd[3]; + +char* EncoderBase::encode_aux(char* stream, unsigned aux, + const Operands& opnds, const OpcodeDesc * odesc, + unsigned * pargsCount, Rex * prex) +{ + const unsigned byte = aux; + OpcodeByteKind kind = (OpcodeByteKind)(byte & OpcodeByteKind_KindMask); + // The '>>' here is to force the switch to be table-based) instead of + // set of CMP+Jcc. + if (*pargsCount >= COUNTOF(opnds)) { + assert(false); + return stream; + } + switch(kind>>8) { + case OpcodeByteKind_SlashR>>8: + // /r - Indicates that the ModR/M byte of the instruction contains + // both a register operand and an r/m operand. + { + assert(opnds.count() > 1); + // not true anymore for MOVQ xmm<->r + //assert((odesc->opnds[0].kind & OpndKind_Mem) || + // (odesc->opnds[1].kind & OpndKind_Mem)); + unsigned memidx = odesc->opnds[0].kind & OpndKind_Mem ? 0 : 1; + unsigned regidx = memidx == 0 ? 
1 : 0; + memidx += *pargsCount; + regidx += *pargsCount; + ModRM& modrm = *(ModRM*)stream; + if (memidx >= COUNTOF(opnds) || regidx >= COUNTOF(opnds)) { + assert(false); + break; + } + if (opnds[memidx].is_mem()) { + stream = encodeModRM(stream, opnds, memidx, odesc, prex); + } + else { + modrm.mod = 3; // 11 + modrm.rm = getHWRegIndex(opnds[memidx].reg()); +#ifdef _EM64T_ + if (opnds[memidx].need_rex() && needs_rex_r(opnds[memidx].reg())) { + prex->b = 1; + } +#endif + ++stream; + } + modrm.reg = getHWRegIndex(opnds[regidx].reg()); +#ifdef _EM64T_ + if (opnds[regidx].need_rex() && needs_rex_r(opnds[regidx].reg())) { + prex->r = 1; + } +#endif + *pargsCount += 2; + } + break; + case OpcodeByteKind_SlashNum>>8: + // /digit - A digit between 0 and 7 indicates that the + // ModR/M byte of the instruction uses only the r/m + // (register or memory) operand. The reg field contains + // the digit that provides an extension to the instruction's + // opcode. + { + const unsigned lowByte = (byte & OpcodeByteKind_OpcodeMask); + assert(lowByte <= 7); + ModRM& modrm = *(ModRM*)stream; + unsigned idx = *pargsCount; + assert(opnds[idx].is_mem() || opnds[idx].is_reg()); + if (opnds[idx].is_mem()) { + stream = encodeModRM(stream, opnds, idx, odesc, prex); + } + else { + modrm.mod = 3; // 11 + modrm.rm = getHWRegIndex(opnds[idx].reg()); +#ifdef _EM64T_ + if (opnds[idx].need_rex() && needs_rex_r(opnds[idx].reg())) { + prex->b = 1; + } +#endif + ++stream; + } + modrm.reg = (char)lowByte; + *pargsCount += 1; + } + break; + case OpcodeByteKind_plus_i>>8: + // +i - A number used in floating-point instructions when one + // of the operands is ST(i) from the FPU register stack. The + // number i (which can range from 0 to 7) is added to the + // hexadecimal byte given at the left of the plus sign to form + // a single opcode byte. 
+ { + unsigned idx = *pargsCount; + const unsigned lowByte = (byte & OpcodeByteKind_OpcodeMask); + *stream = (char)lowByte + getHWRegIndex(opnds[idx].reg()); + ++stream; + *pargsCount += 1; + } + break; + case OpcodeByteKind_ib>>8: + case OpcodeByteKind_iw>>8: + case OpcodeByteKind_id>>8: +#ifdef _EM64T_ + case OpcodeByteKind_io>>8: +#endif //_EM64T_ + // ib, iw, id - A 1-byte (ib), 2-byte (iw), or 4-byte (id) + // immediate operand to the instruction that follows the + // opcode, ModR/M bytes or scale-indexing bytes. The opcode + // determines if the operand is a signed value. All words + // and double words are given with the low-order byte first. + { + unsigned idx = *pargsCount; + *pargsCount += 1; + assert(opnds[idx].is_imm()); + if (kind == OpcodeByteKind_ib) { + *(unsigned char*)stream = (unsigned char)opnds[idx].imm(); + curRelOpnd[idx] = stream; + stream += 1; + } + else if (kind == OpcodeByteKind_iw) { + *(unsigned short*)stream = (unsigned short)opnds[idx].imm(); + curRelOpnd[idx] = stream; + stream += 2; + } + else if (kind == OpcodeByteKind_id) { + *(unsigned*)stream = (unsigned)opnds[idx].imm(); + curRelOpnd[idx] = stream; + stream += 4; + } +#ifdef _EM64T_ + else { + assert(kind == OpcodeByteKind_io); + *(long long*)stream = (long long)opnds[idx].imm(); + curRelOpnd[idx] = stream; + stream += 8; + } +#else + else { + assert(false); + } +#endif + } + break; + case OpcodeByteKind_cb>>8: + assert(opnds[*pargsCount].is_imm()); + *(unsigned char*)stream = (unsigned char)opnds[*pargsCount].imm(); + curRelOpnd[*pargsCount]= stream; + stream += 1; + *pargsCount += 1; + break; + case OpcodeByteKind_cw>>8: + assert(opnds[*pargsCount].is_imm()); + *(unsigned short*)stream = (unsigned short)opnds[*pargsCount].imm(); + curRelOpnd[*pargsCount]= stream; + stream += 2; + *pargsCount += 1; + break; + case OpcodeByteKind_cd>>8: + assert(opnds[*pargsCount].is_imm()); + *(unsigned*)stream = (unsigned)opnds[*pargsCount].imm(); + curRelOpnd[*pargsCount]= stream; + stream 
+= 4; + *pargsCount += 1; + break; + //OpcodeByteKind_cp = 0x0B00, + //OpcodeByteKind_co = 0x0C00, + //OpcodeByteKind_ct = 0x0D00, + case OpcodeByteKind_rb>>8: + case OpcodeByteKind_rw>>8: + case OpcodeByteKind_rd>>8: + // +rb, +rw, +rd - A register code, from 0 through 7, + // added to the hexadecimal byte given at the left of + // the plus sign to form a single opcode byte. + assert(opnds.count() > 0); + assert(opnds[*pargsCount].is_reg()); + { + const unsigned lowByte = (byte & OpcodeByteKind_OpcodeMask); + *(unsigned char*)stream = (unsigned char)lowByte + + getHWRegIndex(opnds[*pargsCount].reg()); +#ifdef _EM64T_ + if (opnds[*pargsCount].need_rex() && needs_rex_r(opnds[*pargsCount].reg())) { + prex->b = 1; + } +#endif + ++stream; + *pargsCount += 1; + } + break; + default: + assert(false); + break; + } + return stream; +} + +char * EncoderBase::encode(char * stream, Mnemonic mn, const Operands& opnds) +{ +#ifdef _DEBUG + if (opnds.count() > 0) { + if (opnds[0].is_mem()) { + assert(getRegKind(opnds[0].base()) != OpndKind_SReg); + } + else if (opnds.count() >1 && opnds[1].is_mem()) { + assert(getRegKind(opnds[1].base()) != OpndKind_SReg); + } + } +#endif + +#ifdef JET_PROTO + char* saveStream = stream; +#endif + + const OpcodeDesc * odesc = lookup(mn, opnds); +#if !defined(_EM64T_) + bool copy_opcode = true; + Rex *prex = NULL; +#else + // We need rex if + // either of registers used as operand or address form is new extended register + // it's explicitly specified by opcode + // So, if we don't have REX in opcode but need_rex, then set rex here + // otherwise, wait until opcode is set, and then update REX + + bool copy_opcode = true; + unsigned char _1st = odesc->opcode[0]; + + Rex *prex = (Rex*)stream; + if (opnds.need_rex() && + ((_1st == 0x66) || (_1st == 0xF2 || _1st == 0xF3) && odesc->opcode[1] == 0x0F)) { + // Special processing + // + copy_opcode = false; + // + *(unsigned char*)stream = _1st; + ++stream; + // + prex = (Rex*)stream; + prex->dummy = 4; + 
prex->w = 0; + prex->b = 0; + prex->x = 0; + prex->r = 0; + ++stream; + // + memcpy(stream, &odesc->opcode[1], odesc->opcode_len-1); + stream += odesc->opcode_len-1; + } + else if (_1st != 0x48 && opnds.need_rex()) { + prex = (Rex*)stream; + prex->dummy = 4; + prex->w = 0; + prex->b = 0; + prex->x = 0; + prex->r = 0; + ++stream; + } +#endif // ifndef EM64T + + if (copy_opcode) { + if (odesc->opcode_len==1) { + unsigned char *dest = (unsigned char *) (stream); + unsigned char *src = (unsigned char *) (& (odesc->opcode)); + *dest = *src; + } + else if (odesc->opcode_len==2) { + short *dest = (short *) (stream); + void *ptr = (void *) (& (odesc->opcode)); + short *src = (short *) (ptr); + *dest = *src; + } + else if (odesc->opcode_len==3) { + unsigned short *dest = (unsigned short *) (stream); + void *ptr = (void *) (& (odesc->opcode)); + unsigned short *src = (unsigned short *) (ptr); + *dest = *src; + + //Now handle the last part + unsigned char *dest2 = (unsigned char *) (stream + 2); + *dest2 = odesc->opcode[2]; + } + else if (odesc->opcode_len==4) { + unsigned int *dest = (unsigned int *) (stream); + void *ptr = (void *) (& (odesc->opcode)); + unsigned int *src = (unsigned int *) (ptr); + *dest = *src; + } + stream += odesc->opcode_len; + } + + unsigned argsCount = odesc->first_opnd; + + if (odesc->aux0) { + stream = encode_aux(stream, odesc->aux0, opnds, odesc, &argsCount, prex); + if (odesc->aux1) { + stream = encode_aux(stream, odesc->aux1, opnds, odesc, &argsCount, prex); + } + } +#ifdef JET_PROTO + //saveStream + Inst inst; + unsigned len = DecoderBase::decode(saveStream, &inst); + assert(inst.mn == mn); + assert(len == (unsigned)(stream-saveStream)); + if (mn == Mnemonic_CALL || mn == Mnemonic_JMP || + Mnemonic_RET == mn || + (Mnemonic_JO<=mn && mn<=Mnemonic_JG)) { + assert(inst.argc == opnds.count()); + + InstructionDisassembler idi(saveStream); + + for (unsigned i=0; i<inst.argc; i++) { + const EncoderBase::Operand& original = opnds[i]; + const 
EncoderBase::Operand& decoded = inst.operands[i]; + assert(original.kind() == decoded.kind()); + assert(original.size() == decoded.size()); + if (original.is_imm()) { + assert(original.imm() == decoded.imm()); + assert(idi.get_opnd(0).kind == InstructionDisassembler::Kind_Imm); + if (mn == Mnemonic_CALL) { + assert(idi.get_type() == InstructionDisassembler::RELATIVE_CALL); + } + else if (mn == Mnemonic_JMP) { + assert(idi.get_type() == InstructionDisassembler::RELATIVE_JUMP); + } + else if (mn == Mnemonic_RET) { + assert(idi.get_type() == InstructionDisassembler::RET); + } + else { + assert(idi.get_type() == InstructionDisassembler::RELATIVE_COND_JUMP); + } + } + else if (original.is_mem()) { + assert(original.base() == decoded.base()); + assert(original.index() == decoded.index()); + assert(original.scale() == decoded.scale()); + assert(original.disp() == decoded.disp()); + assert(idi.get_opnd(0).kind == InstructionDisassembler::Kind_Mem); + if (mn == Mnemonic_CALL) { + assert(idi.get_type() == InstructionDisassembler::INDIRECT_CALL); + } + else if (mn == Mnemonic_JMP) { + assert(idi.get_type() == InstructionDisassembler::INDIRECT_JUMP); + } + else { + assert(false); + } + } + else { + assert(original.is_reg()); + assert(original.reg() == decoded.reg()); + assert(idi.get_opnd(0).kind == InstructionDisassembler::Kind_Reg); + if (mn == Mnemonic_CALL) { + assert(idi.get_type() == InstructionDisassembler::INDIRECT_CALL); + } + else if (mn == Mnemonic_JMP) { + assert(idi.get_type() == InstructionDisassembler::INDIRECT_JUMP); + } + else { + assert(false); + } + } + } + + Inst inst2; + len = DecoderBase::decode(saveStream, &inst2); + } + + // if(idi.get_length_with_prefix() != (int)len) { + //__asm { int 3 }; + // } +#endif + + return stream; +} + +char* EncoderBase::encodeModRM(char* stream, const Operands& opnds, + unsigned idx, const OpcodeDesc * odesc, + Rex * prex) +{ + const Operand& op = opnds[idx]; + assert(op.is_mem()); + assert(idx < COUNTOF(curRelOpnd)); + 
ModRM& modrm = *(ModRM*)stream; + ++stream; + SIB& sib = *(SIB*)stream; + + // we need SIB if + // we have index & scale (nb: having index w/o base and w/o scale + // treated as error) + // the base is EBP w/o disp, BUT let's use a fake disp8 + // the base is ESP (nb: cant have ESP as index) + + RegName base = op.base(); + // only disp ?.. + if (base == RegName_Null && op.index() == RegName_Null) { + assert(op.scale() == 0); // 'scale!=0' has no meaning without index + // ... yes - only have disp + // On EM64T, the simply [disp] addressing means 'RIP-based' one - + // must have to use SIB to encode 'DS: based' +#ifdef _EM64T_ + modrm.mod = 0; // 00 - .. + modrm.rm = 4; // 100 - have SIB + + sib.base = 5; // 101 - none + sib.index = 4; // 100 - none + sib.scale = 0; // + ++stream; // bypass SIB +#else + // ignore disp_fits8, always use disp32. + modrm.mod = 0; + modrm.rm = 5; +#endif + *(unsigned*)stream = (unsigned)op.disp(); + curRelOpnd[idx]= stream; + stream += 4; + return stream; + } + + //climits: error when targeting compal +#define CHAR_MIN -127 +#define CHAR_MAX 127 + const bool disp_fits8 = CHAR_MIN <= op.disp() && op.disp() <= CHAR_MAX; + /*&& op.base() != RegName_Null - just checked above*/ + if (op.index() == RegName_Null && getHWRegIndex(op.base()) != getHWRegIndex(REG_STACK)) { + assert(op.scale() == 0); // 'scale!=0' has no meaning without index + // ... luckily no SIB, only base and may be a disp + + // EBP base is a special case. 
Need to use [EBP] + disp8 form + if (op.disp() == 0 && getHWRegIndex(op.base()) != getHWRegIndex(RegName_EBP)) { + modrm.mod = 0; // mod=00, no disp et all + } + else if (disp_fits8) { + modrm.mod = 1; // mod=01, use disp8 + *(unsigned char*)stream = (unsigned char)op.disp(); + curRelOpnd[idx]= stream; + ++stream; + } + else { + modrm.mod = 2; // mod=10, use disp32 + *(unsigned*)stream = (unsigned)op.disp(); + curRelOpnd[idx]= stream; + stream += 4; + } + modrm.rm = getHWRegIndex(op.base()); + if (is_em64t_extra_reg(op.base())) { + prex->b = 1; + } + return stream; + } + + // cool, we do have SIB. + ++stream; // bypass SIB in stream + + // {E|R}SP cannot be scaled index, however, R12 which has the same index in modrm - can + assert(op.index() == RegName_Null || !equals(op.index(), REG_STACK)); + + // Only GPRegs can be encoded in the SIB + assert(op.base() == RegName_Null || + getRegKind(op.base()) == OpndKind_GPReg); + assert(op.index() == RegName_Null || + getRegKind(op.index()) == OpndKind_GPReg); + + modrm.rm = 4; // r/m = 100, means 'we have SIB here' + if (op.base() == RegName_Null) { + // no base. 
+ // already checked above if + // the first if() //assert(op.index() != RegName_Null); + + modrm.mod = 0; // mod=00 - here it means 'no base, but disp32' + sib.base = 5; // 101 with mod=00 ^^^ + + // encode at least fake disp32 to avoid having [base=ebp] + *(unsigned*)stream = op.disp(); + curRelOpnd[idx]= stream; + stream += 4; + + unsigned sc = op.scale(); + if (sc == 1 || sc==0) { sib.scale = 0; } // SS=00 + else if (sc == 2) { sib.scale = 1; } // SS=01 + else if (sc == 4) { sib.scale = 2; } // SS=10 + else if (sc == 8) { sib.scale = 3; } // SS=11 + sib.index = getHWRegIndex(op.index()); + if (is_em64t_extra_reg(op.index())) { + prex->x = 1; + } + + return stream; + } + + if (op.disp() == 0 && getHWRegIndex(op.base()) != getHWRegIndex(RegName_EBP)) { + modrm.mod = 0; // mod=00, no disp + } + else if (disp_fits8) { + modrm.mod = 1; // mod=01, use disp8 + *(unsigned char*)stream = (unsigned char)op.disp(); + curRelOpnd[idx]= stream; + stream += 1; + } + else { + modrm.mod = 2; // mod=10, use disp32 + *(unsigned*)stream = (unsigned)op.disp(); + curRelOpnd[idx]= stream; + stream += 4; + } + + if (op.index() == RegName_Null) { + assert(op.scale() == 0); // 'scale!=0' has no meaning without index + // the only reason we're here without index, is that we have {E|R}SP + // or R12 as a base. 
Another possible reason - EBP without a disp - + // is handled above by adding a fake disp8 +#ifdef _EM64T_ + assert(op.base() != RegName_Null && (equals(op.base(), REG_STACK) || + equals(op.base(), RegName_R12))); +#else // _EM64T_ + assert(op.base() != RegName_Null && equals(op.base(), REG_STACK)); +#endif //_EM64T_ + sib.scale = 0; // SS = 00 + sib.index = 4; // SS + index=100 means 'no index' + } + else { + unsigned sc = op.scale(); + if (sc == 1 || sc==0) { sib.scale = 0; } // SS=00 + else if (sc == 2) { sib.scale = 1; } // SS=01 + else if (sc == 4) { sib.scale = 2; } // SS=10 + else if (sc == 8) { sib.scale = 3; } // SS=11 + sib.index = getHWRegIndex(op.index()); + if (is_em64t_extra_reg(op.index())) { + prex->x = 1; + } + // not an error by itself, but the usage of [index*1] instead + // of [base] is discouraged + assert(op.base() != RegName_Null || op.scale() != 1); + } + sib.base = getHWRegIndex(op.base()); + if (is_em64t_extra_reg(op.base())) { + prex->b = 1; + } + return stream; +} + +char * EncoderBase::nops(char * stream, unsigned howMany) +{ + // Recommended multi-byte NOPs from the Intel architecture manual + static const unsigned char nops[10][9] = { + { 0, }, // 0, this line is dummy and not used in the loop below + { 0x90, }, // 1-byte NOP + { 0x66, 0x90, }, // 2 + { 0x0F, 0x1F, 0x00, }, // 3 + { 0x0F, 0x1F, 0x40, 0x00, }, // 4 + { 0x0F, 0x1F, 0x44, 0x00, 0x00, }, // 5 + { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00, }, // 6 + { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00, }, // 7 + { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, }, // 8 + { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }, // 9-byte NOP + }; + + // Start from delivering the longest possible NOPs, then proceed with shorter ones + for (unsigned nopSize=9; nopSize!=0; nopSize--) { + while(howMany>=nopSize) { + const unsigned char* nopBytes = nops[nopSize]; + for (unsigned i=0; i<nopSize; i++) { + stream[i] = nopBytes[i]; + } + stream += nopSize; + howMany -= nopSize; + } + } + char* end 
= stream + howMany; + return end; +} + +char * EncoderBase::prefix(char* stream, InstPrefix pref) +{ + if (pref== InstPrefix_Null) { + // nothing to do + return stream; + } + *stream = (char)pref; + return stream + 1; +} + + +/** + * + */ +bool EncoderBase::extAllowed(OpndExt opndExt, OpndExt instExt) { + if (instExt == opndExt || instExt == OpndExt_Any || opndExt == OpndExt_Any) { + return true; + } +//asm("int3"); +assert(0); + return false; +} + +static bool try_match(const EncoderBase::OpcodeDesc& odesc, + const EncoderBase::Operands& opnds, bool strict) { + + assert(odesc.roles.count == opnds.count()); + + for(unsigned j=0; j<odesc.roles.count; j++) { + // - the location must match exactly + if ((odesc.opnds[j].kind & opnds[j].kind()) != opnds[j].kind()) { + return false; + } + if (strict) { + // the size must match exactly + if (odesc.opnds[j].size != opnds[j].size()) { + return false; + } + } + else { + // must match only for def operands, and dont care about use ones + // situations like 'mov r8, imm32/mov r32, imm8' so the + // destination operand defines the overall size + if (EncoderBase::getOpndRoles(odesc.roles, j) & OpndRole_Def) { + if (odesc.opnds[j].size != opnds[j].size()) { + return false; + } + } + } + } + return true; +} + +// +//Subhash implementaion - may be useful in case of many misses during fast +//opcode lookup. 
+// + +#ifdef ENCODER_USE_SUBHASH +static unsigned subHash[32]; + +static unsigned find(Mnemonic mn, unsigned hash) +{ + unsigned key = hash % COUNTOF(subHash); + unsigned pack = subHash[key]; + unsigned _hash = pack & 0xFFFF; + if (_hash != hash) { + stat.miss(mn); + return EncoderBase::NOHASH; + } + unsigned _mn = (pack >> 24)&0xFF; + if (_mn != _mn) { + stat.miss(mn); + return EncoderBase::NOHASH; + } + unsigned idx = (pack >> 16) & 0xFF; + stat.hit(mn); + return idx; +} + +static void put(Mnemonic mn, unsigned hash, unsigned idx) +{ + unsigned pack = hash | (idx<<16) | (mn << 24); + unsigned key = hash % COUNTOF(subHash); + subHash[key] = pack; +} +#endif + +const EncoderBase::OpcodeDesc * +EncoderBase::lookup(Mnemonic mn, const Operands& opnds) +{ + const unsigned hash = opnds.hash(); + unsigned opcodeIndex = opcodesHashMap[mn][hash]; +#ifdef ENCODER_USE_SUBHASH + if (opcodeIndex == NOHASH) { + opcodeIndex = find(mn, hash); + } +#endif + + if (opcodeIndex == NOHASH) { + // fast-path did no work. 
try to lookup sequentially + const OpcodeDesc * odesc = opcodes[mn]; + int idx = -1; + bool found = false; + for (idx=0; !odesc[idx].last; idx++) { + const OpcodeDesc& opcode = odesc[idx]; + if (opcode.platf == OpcodeInfo::decoder) { + continue; + } + if (opcode.roles.count != opnds.count()) { + continue; + } + if (try_match(opcode, opnds, true)) { + found = true; + break; + } + } + if (!found) { + for (idx=0; !odesc[idx].last; idx++) { + const OpcodeDesc& opcode = odesc[idx]; + if (opcode.platf == OpcodeInfo::decoder) { + continue; + } + if (opcode.roles.count != opnds.count()) { + continue; + } + if (try_match(opcode, opnds, false)) { + found = true; + break; + } + } + } + assert(found); + opcodeIndex = idx; +#ifdef ENCODER_USE_SUBHASH + put(mn, hash, opcodeIndex); +#endif + } + assert(opcodeIndex != NOHASH); + const OpcodeDesc * odesc = &opcodes[mn][opcodeIndex]; + assert(!odesc->last); + assert(odesc->roles.count == opnds.count()); + assert(odesc->platf != OpcodeInfo::decoder); +#if !defined(_EM64T_) + // tuning was done for IA32 only, so no size restriction on EM64T + //assert(sizeof(OpcodeDesc)==128); +#endif + return odesc; +} + +char* EncoderBase::getOpndLocation(int index) { + assert(index < 3); + return curRelOpnd[index]; +} + + +Mnemonic EncoderBase::str2mnemonic(const char * mn_name) +{ + for (unsigned m = 1; m<Mnemonic_Count; m++) { + if (!strcmpi(mnemonics[m].name, mn_name)) { + return (Mnemonic)m; + } + } + return Mnemonic_Null; +} + +static const char * conditionStrings[ConditionMnemonic_Count] = { + "O", + "NO", + "B", + "AE", + "Z", + "NZ", + "BE", + "A", + + "S", + "NS", + "P", + "NP", + "L", + "GE", + "LE", + "G", +}; + +const char * getConditionString(ConditionMnemonic cm) { + return conditionStrings[cm]; +} + +static const struct { + char sizeString[12]; + OpndSize size; +} +sizes[] = { + { "Sz8", OpndSize_8 }, + { "Sz16", OpndSize_16 }, + { "Sz32", OpndSize_32 }, + { "Sz64", OpndSize_64 }, +#if !defined(TESTING_ENCODER) + { "Sz80", 
OpndSize_80 }, + { "Sz128", OpndSize_128 }, +#endif + { "SzAny", OpndSize_Any }, +}; + + +OpndSize getOpndSize(const char * sizeString) +{ + assert(sizeString); + for (unsigned i = 0; i<COUNTOF(sizes); i++) { + if (!strcmpi(sizeString, sizes[i].sizeString)) { + return sizes[i].size; + } + } + return OpndSize_Null; +} + +const char * getOpndSizeString(OpndSize size) { + for( unsigned i = 0; i<COUNTOF(sizes); i++ ) { + if( sizes[i].size==size ) { + return sizes[i].sizeString; + } + } + return NULL; +} + +static const struct { + char kindString[16]; + OpndKind kind; +} +kinds[] = { + { "Null", OpndKind_Null }, + { "GPReg", OpndKind_GPReg }, + { "SReg", OpndKind_SReg }, + { "FPReg", OpndKind_FPReg }, + { "XMMReg", OpndKind_XMMReg }, +#ifdef _HAVE_MMX_ + { "MMXReg", OpndKind_MMXReg }, +#endif + { "StatusReg", OpndKind_StatusReg }, + { "Reg", OpndKind_Reg }, + { "Imm", OpndKind_Imm }, + { "Mem", OpndKind_Mem }, + { "Any", OpndKind_Any }, +}; + +const char * getOpndKindString(OpndKind kind) +{ + for (unsigned i = 0; i<COUNTOF(kinds); i++) { + if (kinds[i].kind==kind) { + return kinds[i].kindString; + } + } + return NULL; +} + +OpndKind getOpndKind(const char * kindString) +{ + assert(kindString); + for (unsigned i = 0; i<COUNTOF(kinds); i++) { + if (!strcmpi(kindString, kinds[i].kindString)) { + return kinds[i].kind; + } + } + return OpndKind_Null; +} + +/** + * A mapping between register string representation and its RegName constant. 
+ */ +static const struct { + char regstring[7]; + RegName regname; +} + +registers[] = { +#ifdef _EM64T_ + {"RAX", RegName_RAX}, + {"RBX", RegName_RBX}, + {"RCX", RegName_RCX}, + {"RDX", RegName_RDX}, + {"RBP", RegName_RBP}, + {"RSI", RegName_RSI}, + {"RDI", RegName_RDI}, + {"RSP", RegName_RSP}, + {"R8", RegName_R8}, + {"R9", RegName_R9}, + {"R10", RegName_R10}, + {"R11", RegName_R11}, + {"R12", RegName_R12}, + {"R13", RegName_R13}, + {"R14", RegName_R14}, + {"R15", RegName_R15}, +#endif + + {"EAX", RegName_EAX}, + {"ECX", RegName_ECX}, + {"EDX", RegName_EDX}, + {"EBX", RegName_EBX}, + {"ESP", RegName_ESP}, + {"EBP", RegName_EBP}, + {"ESI", RegName_ESI}, + {"EDI", RegName_EDI}, +#ifdef _EM64T_ + {"R8D", RegName_R8D}, + {"R9D", RegName_R9D}, + {"R10D", RegName_R10D}, + {"R11D", RegName_R11D}, + {"R12D", RegName_R12D}, + {"R13D", RegName_R13D}, + {"R14D", RegName_R14D}, + {"R15D", RegName_R15D}, +#endif + + {"AX", RegName_AX}, + {"CX", RegName_CX}, + {"DX", RegName_DX}, + {"BX", RegName_BX}, + {"SP", RegName_SP}, + {"BP", RegName_BP}, + {"SI", RegName_SI}, + {"DI", RegName_DI}, + + {"AL", RegName_AL}, + {"CL", RegName_CL}, + {"DL", RegName_DL}, + {"BL", RegName_BL}, +#if !defined(_EM64T_) + {"AH", RegName_AH}, + {"CH", RegName_CH}, + {"DH", RegName_DH}, + {"BH", RegName_BH}, +#else + {"SPL", RegName_SPL}, + {"BPL", RegName_BPL}, + {"SIL", RegName_SIL}, + {"DIL", RegName_DIL}, + {"R8L", RegName_R8L}, + {"R9L", RegName_R9L}, + {"R10L", RegName_R10L}, + {"R11L", RegName_R11L}, + {"R12L", RegName_R12L}, + {"R13L", RegName_R13L}, + {"R14L", RegName_R14L}, + {"R15L", RegName_R15L}, +#endif + {"ES", RegName_ES}, + {"CS", RegName_CS}, + {"SS", RegName_SS}, + {"DS", RegName_DS}, + {"FS", RegName_FS}, + {"GS", RegName_GS}, + + {"FP0", RegName_FP0}, +/* + {"FP1", RegName_FP1}, + {"FP2", RegName_FP2}, + {"FP3", RegName_FP3}, + {"FP4", RegName_FP4}, + {"FP5", RegName_FP5}, + {"FP6", RegName_FP6}, + {"FP7", RegName_FP7}, +*/ + {"FP0S", RegName_FP0S}, + {"FP1S", RegName_FP1S}, + 
{"FP2S", RegName_FP2S}, + {"FP3S", RegName_FP3S}, + {"FP4S", RegName_FP4S}, + {"FP5S", RegName_FP5S}, + {"FP6S", RegName_FP6S}, + {"FP7S", RegName_FP7S}, + + {"FP0D", RegName_FP0D}, + {"FP1D", RegName_FP1D}, + {"FP2D", RegName_FP2D}, + {"FP3D", RegName_FP3D}, + {"FP4D", RegName_FP4D}, + {"FP5D", RegName_FP5D}, + {"FP6D", RegName_FP6D}, + {"FP7D", RegName_FP7D}, + + {"XMM0", RegName_XMM0}, + {"XMM1", RegName_XMM1}, + {"XMM2", RegName_XMM2}, + {"XMM3", RegName_XMM3}, + {"XMM4", RegName_XMM4}, + {"XMM5", RegName_XMM5}, + {"XMM6", RegName_XMM6}, + {"XMM7", RegName_XMM7}, +#ifdef _EM64T_ + {"XMM8", RegName_XMM8}, + {"XMM9", RegName_XMM9}, + {"XMM10", RegName_XMM10}, + {"XMM11", RegName_XMM11}, + {"XMM12", RegName_XMM12}, + {"XMM13", RegName_XMM13}, + {"XMM14", RegName_XMM14}, + {"XMM15", RegName_XMM15}, +#endif + + + {"XMM0S", RegName_XMM0S}, + {"XMM1S", RegName_XMM1S}, + {"XMM2S", RegName_XMM2S}, + {"XMM3S", RegName_XMM3S}, + {"XMM4S", RegName_XMM4S}, + {"XMM5S", RegName_XMM5S}, + {"XMM6S", RegName_XMM6S}, + {"XMM7S", RegName_XMM7S}, +#ifdef _EM64T_ + {"XMM8S", RegName_XMM8S}, + {"XMM9S", RegName_XMM9S}, + {"XMM10S", RegName_XMM10S}, + {"XMM11S", RegName_XMM11S}, + {"XMM12S", RegName_XMM12S}, + {"XMM13S", RegName_XMM13S}, + {"XMM14S", RegName_XMM14S}, + {"XMM15S", RegName_XMM15S}, +#endif + + {"XMM0D", RegName_XMM0D}, + {"XMM1D", RegName_XMM1D}, + {"XMM2D", RegName_XMM2D}, + {"XMM3D", RegName_XMM3D}, + {"XMM4D", RegName_XMM4D}, + {"XMM5D", RegName_XMM5D}, + {"XMM6D", RegName_XMM6D}, + {"XMM7D", RegName_XMM7D}, +#ifdef _EM64T_ + {"XMM8D", RegName_XMM8D}, + {"XMM9D", RegName_XMM9D}, + {"XMM10D", RegName_XMM10D}, + {"XMM11D", RegName_XMM11D}, + {"XMM12D", RegName_XMM12D}, + {"XMM13D", RegName_XMM13D}, + {"XMM14D", RegName_XMM14D}, + {"XMM15D", RegName_XMM15D}, +#endif + + {"EFLGS", RegName_EFLAGS}, +}; + + +const char * getRegNameString(RegName reg) +{ + for (unsigned i = 0; i<COUNTOF(registers); i++) { + if (registers[i].regname == reg) { + return registers[i].regstring; 
+ } + } + return "(null)"; +} + +RegName getRegName(const char * regname) +{ + if (NULL == regname) { + return RegName_Null; + } + + for (unsigned i = 0; i<COUNTOF(registers); i++) { + if (!strcmpi(regname,registers[i].regstring)) { + return registers[i].regname; + } + } + return RegName_Null; +} + +ENCODER_NAMESPACE_END diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_base.h b/libpixelflinger/codeflinger/x86/libenc/enc_base.h new file mode 100644 index 0000000..fa1062d --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_base.h @@ -0,0 +1,748 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ + +/** + * @file + * @brief Main encoding routines and structures. + */ + +#ifndef __ENC_BASE_H_INCLUDED__ +#define __ENC_BASE_H_INCLUDED__ + +#include "enc_defs.h" + + +#include <stdlib.h> +#include <assert.h> +#include <memory.h> + +ENCODER_NAMESPACE_START +struct MnemonicInfo; +struct OpcodeInfo; +struct Rex; + +/** + * @brief Basic facilities for generation of processor's instructions. + * + * The class EncoderBase represents the basic facilities for the encoding of + * processor's instructions on IA32 and EM64T platforms. 
+ * + * The class provides general interface to generate the instructions as well + * as to retrieve some static data about instructions (number of arguments, + * their roles, etc). + * + * Currently, the EncoderBase class is used for both LIL and Jitrino code + * generators. Each of these code generators has its own wrapper to adapt + * this general interface for specific needs - see encoder.h for LIL wrappers + * and Ia32Encoder.h for Jitrino's adapter. + * + * Interface is provided through static methods, no instances of EncoderBase + * to be created. + * + * @todo RIP-based addressing on EM64T - it's not yet supported currently. + */ +class EncoderBase { +public: + class Operands; + struct MnemonicDesc; + /** + * @brief Generates processor's instruction. + * + * @param stream - a buffer to generate into + * @param mn - \link Mnemonic mnemonic \endlink of the instruction + * @param opnds - operands for the instruction + * @returns (stream + length of the just generated instruction) + */ + static char * encode(char * stream, Mnemonic mn, const Operands& opnds); + static char * getOpndLocation(int index); + + /** + * @brief Generates the smallest possible number of NOP-s. + * + * Effectively generates the smallest possible number of instructions, + * which are NOP-s for CPU. Normally used to make a code alignment. + * + * The method inserts exactly number of bytes specified. It's a caller's + * responsibility to make sure the buffer is big enough. + * + * @param stream - buffer where to generate code into, can not be NULL + * @param howMany - how many bytes to fill with NOP-s + * @return \c (stream+howMany) + */ + static char * nops(char * stream, unsigned howMany); + + /** + * @brief Inserts a prefix into the code buffer. + * + * The method writes no more than one byte into the buffer. This is a + * caller's responsibility to make sure the buffer is big enough. + * + * @param stream - buffer where to insert the prefix + * @param pref - prefix to be inserted. 
If it's InstPrefix_Null, then + * no action performed and return value is \c stream. + * @return \c (stream+1) if pref is not InstPrefix_Null, or \c stream + * otherwise + */ + static char * prefix(char* stream, InstPrefix pref); + + /** + * @brief Determines if operand with opndExt suites the position with instExt. + */ + static bool extAllowed(OpndExt opndExt, OpndExt instExt); + + /** + * @brief Returns MnemonicDesc by the given Mnemonic. + */ + static const MnemonicDesc * getMnemonicDesc(Mnemonic mn) + { + assert(mn < Mnemonic_Count); + return mnemonics + mn; + } + + /** + * @brief Returns a Mnemonic for the given name. + * + * The lookup is case insensitive, if no mnemonic found for the given + * string, then Mnemonic_Null returned. + */ + static Mnemonic str2mnemonic(const char * mn_name); + + /** + * @brief Returns a string representation of the given Mnemonic. + * + * If invalid mnemonic passed, then the behavior is unpredictable. + */ + static const char * getMnemonicString(Mnemonic mn) + { + return getMnemonicDesc(mn)->name; + } + + static const char * toStr(Mnemonic mn) + { + return getMnemonicDesc(mn)->name; + } + + + /** + * @brief Description of operand. + * + * Description of an operand in opcode - its kind, size or RegName if + * operand must be a particular register. + */ + struct OpndDesc { + /** + * @brief Location of the operand. + * + * May be a mask, i.e. OpndKind_Imm|OpndKind_Mem. + */ + OpndKind kind; + /** + * @brief Size of the operand. + */ + OpndSize size; + /** + * @brief Extention of the operand. + */ + OpndExt ext; + /** + * @brief Appropriate RegName if operand must reside on a particular + * register (i.e. CWD/CDQ instructions), RegName_Null + * otherwise. + */ + RegName reg; + }; + + /** + * @brief Description of operands' roles in instruction. + */ + struct OpndRolesDesc { + /** + * @brief Total number of operands in the operation. + */ + unsigned count; + /** + * @brief Number of defs in the operation. 
+ */ + unsigned defCount; + /** + * @brief Number of uses in the operation. + */ + unsigned useCount; + /** + * @brief Operand roles, bit-packed. + * + * A bit-packed info about operands' roles. Each operand's role is + * described by two bits, counted from right-to-left - the less + * significant bits (0,1) represent operand#0. + * + * The mask is build by ORing #OpndRole_Def and #OpndRole_Use + * appropriately and shifting left, i.e. operand#0's role would be + * - '(OpndRole_Def|OpndRole_Use)' + * - opnd#1's role would be 'OpndRole_Use<<2' + * - and operand#2's role would be, say, 'OpndRole_Def<<4'. + */ + unsigned roles; + }; + + /** + * @brief Extracts appropriate OpndRole for a given operand. + * + * The order of operands is left-to-right, i.e. for MOV, it + * would be 'MOV op0, op1' + */ + static OpndRole getOpndRoles(OpndRolesDesc ord, unsigned idx) + { + assert(idx < ord.count); + return (OpndRole)(ord.roles>>((ord.count-1-idx)*2) & 0x3); + } + + /** + * @brief Defines the maximum number of operands for an opcode. + * + * The 3 mostly comes from IDIV/IMUL which both may have up to + * 3 operands. + */ + static const unsigned int MAX_NUM_OPCODE_OPERANDS = 3; + + /** + * @brief Info about single opcode - its opcode bytes, operands, + * operands' roles. + */ + union OpcodeDesc { + char dummy[128]; // To make total size a power of 2 + + struct { + /** + * @brief Raw opcode bytes. + * + * 'Raw' opcode bytes which do not require any analysis and are + * independent from arguments/sizes/etc (may include opcode size + * prefix). + */ + char opcode[5]; + unsigned opcode_len; + unsigned aux0; + unsigned aux1; + /** + * @brief Info about opcode's operands. + */ + OpndDesc opnds[MAX_NUM_OPCODE_OPERANDS]; + unsigned first_opnd; + /** + * @brief Info about operands - total number, number of uses/defs, + * operands' roles. + */ + OpndRolesDesc roles; + /** + * @brief If not zero, then this is final OpcodeDesc structure in + * the list of opcodes for a given mnemonic. 
+ */ + char last; + char platf; + }; + }; +public: + /** + * @brief General info about mnemonic. + */ + struct MnemonicDesc { + /** + * @brief The mnemonic itself. + */ + Mnemonic mn; + /** + * Various characteristics of mnemonic. + * @see MF_ + */ + unsigned flags; + /** + * @brief Operation's operand's count and roles. + * + * For the operations whose opcodes may use different number of + * operands (i.e. IMUL/SHL) either most common value used, or empty + * value left. + */ + OpndRolesDesc roles; + /** + * @brief Print name of the mnemonic. + */ + const char * name; + }; + + + /** + * @brief Magic number, shows a maximum value a hash code can take. + * + * For meaning and arithmetics see enc_tabl.cpp. + * + * The value was increased from '5155' to '8192' to make it aligned + * for faster access in EncoderBase::lookup(). + * + * It was further increased to 16384 as support for 3 operand opcodes + * with XMM registers were added + */ + static const unsigned int HASH_MAX = 16384; //5155; + /** + * @brief Empty value, used in hash-to-opcode map to show an empty slot. + */ + static const unsigned char NOHASH = 0xFF; + /** + * @brief The name says it all. + */ + static const unsigned char HASH_BITS_PER_OPERAND = 5; + + /** + * @brief Contains info about a single instructions's operand - its + * location, size and a value for immediate or RegName for + * register operands. + */ + class Operand { + public: + /** + * @brief Initializes the instance with empty size and kind. + */ + Operand() : m_kind(OpndKind_Null), m_size(OpndSize_Null), m_ext(OpndExt_None), m_need_rex(false) {} + /** + * @brief Creates register operand from given RegName. + */ + Operand(RegName reg, OpndExt ext = OpndExt_None) : m_kind(getRegKind(reg)), + m_size(getRegSize(reg)), + m_ext(ext), m_reg(reg) + { + hash_it(); + } + /** + * @brief Creates register operand from given RegName and with the + * specified size and kind. 
+ * + * Used to speedup Operand creation as there is no need to extract + * size and kind from the RegName. + * The provided size and kind must match the RegName's ones though. + */ + Operand(OpndSize sz, OpndKind kind, RegName reg, OpndExt ext = OpndExt_None) : + m_kind(kind), m_size(sz), m_ext(ext), m_reg(reg) + { + assert(m_size == getRegSize(reg)); + assert(m_kind == getRegKind(reg)); + hash_it(); + } + /** + * @brief Creates immediate operand with the given size and value. + */ + Operand(OpndSize size, long long ival, OpndExt ext = OpndExt_None) : + m_kind(OpndKind_Imm), m_size(size), m_ext(ext), m_imm64(ival) + { + hash_it(); + } + /** + * @brief Creates immediate operand of OpndSize_32. + */ + Operand(int ival, OpndExt ext = OpndExt_None) : + m_kind(OpndKind_Imm), m_size(OpndSize_32), m_ext(ext), m_imm64(ival) + { + hash_it(); + } + /** + * @brief Creates immediate operand of OpndSize_16. + */ + Operand(short ival, OpndExt ext = OpndExt_None) : + m_kind(OpndKind_Imm), m_size(OpndSize_16), m_ext(ext), m_imm64(ival) + { + hash_it(); + } + + /** + * @brief Creates immediate operand of OpndSize_8. + */ + Operand(char ival, OpndExt ext = OpndExt_None) : + m_kind(OpndKind_Imm), m_size(OpndSize_8), m_ext(ext), m_imm64(ival) + { + hash_it(); + } + + /** + * @brief Creates memory operand. + */ + Operand(OpndSize size, RegName base, RegName index, unsigned scale, + int disp, OpndExt ext = OpndExt_None) : m_kind(OpndKind_Mem), m_size(size), m_ext(ext) + { + m_base = base; + m_index = index; + m_scale = scale; + m_disp = disp; + hash_it(); + } + + /** + * @brief Creates memory operand with only base and displacement. + */ + Operand(OpndSize size, RegName base, int disp, OpndExt ext = OpndExt_None) : + m_kind(OpndKind_Mem), m_size(size), m_ext(ext) + { + m_base = base; + m_index = RegName_Null; + m_scale = 0; + m_disp = disp; + hash_it(); + } + // + // general info + // + /** + * @brief Returns kind of the operand. 
+ */ + OpndKind kind(void) const { return m_kind; } + /** + * @brief Returns size of the operand. + */ + OpndSize size(void) const { return m_size; } + /** + * @brief Returns extention of the operand. + */ + OpndExt ext(void) const { return m_ext; } + /** + * @brief Returns hash of the operand. + */ + unsigned hash(void) const { return m_hash; } + // +#ifdef _EM64T_ + bool need_rex(void) const { return m_need_rex; } +#else + bool need_rex(void) const { return false; } +#endif + /** + * @brief Tests whether operand is memory operand. + */ + bool is_mem(void) const { return is_placed_in(OpndKind_Mem); } + /** + * @brief Tests whether operand is immediate operand. + */ + bool is_imm(void) const { return is_placed_in(OpndKind_Imm); } + /** + * @brief Tests whether operand is register operand. + */ + bool is_reg(void) const { return is_placed_in(OpndKind_Reg); } + /** + * @brief Tests whether operand is general-purpose register operand. + */ + bool is_gpreg(void) const { return is_placed_in(OpndKind_GPReg); } + /** + * @brief Tests whether operand is float-point pseudo-register operand. + */ + bool is_fpreg(void) const { return is_placed_in(OpndKind_FPReg); } + /** + * @brief Tests whether operand is XMM register operand. + */ + bool is_xmmreg(void) const { return is_placed_in(OpndKind_XMMReg); } +#ifdef _HAVE_MMX_ + /** + * @brief Tests whether operand is MMX register operand. + */ + bool is_mmxreg(void) const { return is_placed_in(OpndKind_MMXReg); } +#endif + /** + * @brief Tests whether operand is signed immediate operand. + */ + //bool is_signed(void) const { assert(is_imm()); return m_is_signed; } + + /** + * @brief Returns base of memory operand (RegName_Null if not memory). + */ + RegName base(void) const { return is_mem() ? m_base : RegName_Null; } + /** + * @brief Returns index of memory operand (RegName_Null if not memory). + */ + RegName index(void) const { return is_mem() ? 
m_index : RegName_Null; } + /** + * @brief Returns scale of memory operand (0 if not memory). + */ + unsigned scale(void) const { return is_mem() ? m_scale : 0; } + /** + * @brief Returns displacement of memory operand (0 if not memory). + */ + int disp(void) const { return is_mem() ? m_disp : 0; } + /** + * @brief Returns RegName of register operand (RegName_Null if not + * register). + */ + RegName reg(void) const { return is_reg() ? m_reg : RegName_Null; } + /** + * @brief Returns value of immediate operand (0 if not immediate). + */ + long long imm(void) const { return is_imm() ? m_imm64 : 0; } + private: + bool is_placed_in(OpndKind kd) const + { + return kd == OpndKind_Reg ? + m_kind == OpndKind_GPReg || +#ifdef _HAVE_MMX_ + m_kind == OpndKind_MMXReg || +#endif + m_kind == OpndKind_FPReg || + m_kind == OpndKind_XMMReg + : kd == m_kind; + } + void hash_it(void) + { + m_hash = get_size_hash(m_size) | get_kind_hash(m_kind); +#ifdef _EM64T_ + m_need_rex = false; + if (is_reg() && is_em64t_extra_reg(m_reg)) { + m_need_rex = true; + } + else if (is_mem() && (is_em64t_extra_reg(m_base) || + is_em64t_extra_reg(m_index))) { + m_need_rex = true; + } +#endif + } + // general info + OpndKind m_kind; + OpndSize m_size; + OpndExt m_ext; + // complex address form support + RegName m_base; + RegName m_index; + unsigned m_scale; + union { + int m_disp; + RegName m_reg; + long long m_imm64; + }; + unsigned m_hash; + bool m_need_rex; + friend class EncoderBase::Operands; + }; + /** + * @brief Simple container for up to 3 Operand-s. 
+ */ + class Operands { + public: + Operands(void) + { + clear(); + } + Operands(const Operand& op0) + { + clear(); + add(op0); + } + + Operands(const Operand& op0, const Operand& op1) + { + clear(); + add(op0); add(op1); + } + + Operands(const Operand& op0, const Operand& op1, const Operand& op2) + { + clear(); + add(op0); add(op1); add(op2); + } + + unsigned count(void) const { return m_count; } + unsigned hash(void) const { return m_hash; } + const Operand& operator[](unsigned idx) const + { + assert(idx<m_count); + return m_operands[idx]; + } + + void add(const Operand& op) + { + assert(m_count < COUNTOF(m_operands)); + m_hash = (m_hash<<HASH_BITS_PER_OPERAND) | op.hash(); + m_operands[m_count++] = op; + m_need_rex = m_need_rex || op.m_need_rex; + } +#ifdef _EM64T_ + bool need_rex(void) const { return m_need_rex; } +#else + bool need_rex(void) const { return false; } +#endif + void clear(void) + { + m_count = 0; m_hash = 0; m_need_rex = false; + } + private: + unsigned m_count; + Operand m_operands[COUNTOF( ((OpcodeDesc*)NULL)->opnds )]; + unsigned m_hash; + bool m_need_rex; + }; +public: +#ifdef _DEBUG + /** + * Verifies some presumptions about encoding data table. + * Called automaticaly during statics initialization. + */ + static int verify(void); +#endif + +private: + /** + * @brief Returns found OpcodeDesc by the given Mnemonic and operands. + */ + static const OpcodeDesc * lookup(Mnemonic mn, const Operands& opnds); + /** + * @brief Encodes mod/rm byte. + */ + static char* encodeModRM(char* stream, const Operands& opnds, + unsigned idx, const OpcodeDesc * odesc, Rex * prex); + /** + * @brief Encodes special things of opcode description - '/r', 'ib', etc. + */ + static char* encode_aux(char* stream, unsigned aux, + const Operands& opnds, const OpcodeDesc * odesc, + unsigned * pargsCount, Rex* prex); +#ifdef _EM64T_ + /** + * @brief Returns true if the 'reg' argument represents one of the new + * EM64T registers - R8(D)-R15(D). 
+ * + * The 64 bits versions of 'old-fashion' registers, i.e. RAX are not + * considered as 'extra'. + */ + static bool is_em64t_extra_reg(const RegName reg) + { + if (needs_rex_r(reg)) { + return true; + } + if (RegName_SPL <= reg && reg <= RegName_R15L) { + return true; + } + return false; + } + static bool needs_rex_r(const RegName reg) + { + if (RegName_R8 <= reg && reg <= RegName_R15) { + return true; + } + if (RegName_R8D <= reg && reg <= RegName_R15D) { + return true; + } + if (RegName_R8S <= reg && reg <= RegName_R15S) { + return true; + } + if (RegName_R8L <= reg && reg <= RegName_R15L) { + return true; + } + if (RegName_XMM8 <= reg && reg <= RegName_XMM15) { + return true; + } + if (RegName_XMM8D <= reg && reg <= RegName_XMM15D) { + return true; + } + if (RegName_XMM8S <= reg && reg <= RegName_XMM15S) { + return true; + } + return false; + } + /** + * @brief Returns an 'processor's index' of the register - the index + * used to encode the register in ModRM/SIB bytes. + * + * For the new EM64T registers the 'HW index' differs from the index + * encoded in RegName. For old-fashion registers it's effectively the + * same as ::getRegIndex(RegName). + */ + static unsigned char getHWRegIndex(const RegName reg) + { + if (getRegKind(reg) != OpndKind_GPReg) { + return getRegIndex(reg); + } + if (RegName_SPL <= reg && reg<=RegName_DIL) { + return getRegIndex(reg); + } + if (RegName_R8L<= reg && reg<=RegName_R15L) { + return getRegIndex(reg) - getRegIndex(RegName_R8L); + } + return is_em64t_extra_reg(reg) ? + getRegIndex(reg)-getRegIndex(RegName_R8D) : getRegIndex(reg); + } +#else + static unsigned char getHWRegIndex(const RegName reg) + { + return getRegIndex(reg); + } + static bool is_em64t_extra_reg(const RegName reg) + { + return false; + } +#endif +public: + static unsigned char get_size_hash(OpndSize size) { + return (size <= OpndSize_64) ? size_hash[size] : 0xFF; + } + static unsigned char get_kind_hash(OpndKind kind) { + return (kind <= OpndKind_Mem) ? 
kind_hash[kind] : 0xFF; + } + + /** + * @brief A table used for the fast computation of hash value. + * + * A change must be strictly balanced with hash-related functions and data + * in enc_base.h/.cpp. + */ + static const unsigned char size_hash[OpndSize_64+1]; + /** + * @brief A table used for the fast computation of hash value. + * + * A change must be strictly balanced with hash-related functions and data + * in enc_base.h/.cpp. + */ + static const unsigned char kind_hash[OpndKind_Mem+1]; + /** + * @brief Maximum number of opcodes used for a single mnemonic. + * + * No arithmetics behind the number, simply estimated. + */ + static const unsigned int MAX_OPCODES = 32; //20; + /** + * @brief Mapping between operands hash code and operands. + */ + static unsigned char opcodesHashMap[Mnemonic_Count][HASH_MAX]; + /** + * @brief Array of mnemonics. + */ + static MnemonicDesc mnemonics[Mnemonic_Count]; + /** + * @brief Array of available opcodes. + */ + static OpcodeDesc opcodes[Mnemonic_Count][MAX_OPCODES]; + + static int buildTable(void); + static void buildMnemonicDesc(const MnemonicInfo * minfo); + /** + * @brief Computes hash value for the given operands. + */ + static unsigned short getHash(const OpcodeInfo* odesc); + /** + * @brief Dummy variable, for automatic invocation of buildTable() at + * startup. + */ + static int dummy; + + static char * curRelOpnd[3]; +}; + +ENCODER_NAMESPACE_END + +#endif // ifndef __ENC_BASE_H_INCLUDED__ diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_defs.h b/libpixelflinger/codeflinger/x86/libenc/enc_defs.h new file mode 100644 index 0000000..10409d2 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_defs.h @@ -0,0 +1,786 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ +#ifndef _ENCODER_DEFS_H_ +#define _ENCODER_DEFS_H_ + + +// Used to isolate experimental or being tuned encoder into a separate +// namespace so it can coexist with a stable one in the same bundle. +#ifdef ENCODER_ISOLATE + #define ENCODER_NAMESPACE_START namespace enc_ia32 { + #define ENCODER_NAMESPACE_END }; +#else + #define ENCODER_NAMESPACE_START + #define ENCODER_NAMESPACE_END +#endif + +#include <assert.h> +#include "enc_defs_ext.h" + +#ifndef COUNTOF + /** + * Number of items in an array. + */ + #define COUNTOF(a) (sizeof(a)/sizeof(a[0])) +#endif + +#ifdef _EM64T_ + /** + * A stack pointer of default platform's size. + */ + #define REG_STACK RegName_RSP + /** + * A max GP register (with a highest index number) + */ + #define REG_MAX RegName_R15 + /** + * Total number of GP registers including stack pointer. + */ + #define MAX_REGS 15 +#else + #define REG_STACK RegName_ESP + #define REG_MAX RegName_EDI + #define MAX_REGS 8 +#endif + +ENCODER_NAMESPACE_START + +/** + * A number of bytes 'eaten' by an ordinary PUSH/POP. + */ +#define STACK_SLOT_SIZE (sizeof(void*)) + + +/** + * A recommended by Intel Arch Manual aligment for instructions that + * are targets for jmps. + */ +#define JMP_TARGET_ALIGMENT (16) +/** + * A maximum possible size of native instruction. 
+ */ +#define MAX_NATIVE_INST_SIZE (15) +/** + * The enum OpndKind describes an operand's location - memory, immediate or a register. + * It can be used as a bit mask. + */ +typedef enum OpndKind { + /** + * A change must be balanced with at least the following places: + * Ia32::Constraint-s use the OpndKind as a mask + * encoder.cpp & encoder_master_info.cpp uses OpndKind as an index for hashing + * - perhaps there are much more places + * + * NOTE: an MMXReg kind is incompatible with the current constraints framework, + * as it's not encoded as a mask. + */ + OpndKind_Null=0, + OpndKind_GPReg = 0x01, OpndKind_MinRegKind = OpndKind_GPReg, + OpndKind_SReg = 0x02, +#ifdef _HAVE_MMX_ + OpndKind_MMXReg = 0x03, +#endif + OpndKind_FPReg = 0x04, + OpndKind_XMMReg = 0x08, + OpndKind_OtherReg = 0x10, + OpndKind_StatusReg = OpndKind_OtherReg, + OpndKind_MaxRegKind = OpndKind_StatusReg, // a max existing kind of register + OpndKind_MaxReg, // -'- + 1 to be used in array defs + // + OpndKind_Immediate = 0x20, OpndKind_Imm=OpndKind_Immediate, + OpndKind_Memory = 0x40, OpndKind_Mem=OpndKind_Memory, + // + OpndKind_Reg = 0x1F, + OpndKind_Any = 0x7F, + // syntetic constants. Normally not used anywhere, but are used for + // human-readable showing under the debugger + OpndKind_GPReg_Mem = OpndKind_GPReg|OpndKind_Mem, +#ifdef _HAVE_MMX_ + OpndKind_MMXReg_Mem = OpndKind_MMXReg|OpndKind_Mem, +#endif + OpndKind_XMMReg_Mem = OpndKind_XMMReg|OpndKind_Mem, +} OpndKind; + +/** + * Defines type of extention allowed for particular operand. + * For example imul r32,r_m32,imm8 sign extend imm8 before performing multiplication. + * To satisfy instruction constraints immediate operand should be either OpndExt_Signed + * or OpndExt_Any. + */ +typedef enum OpndExt { + OpndExt_None = 0x0, + OpndExt_Signed = 0x1, + OpndExt_Zero = 0x2, + OpndExt_Any = 0x3, +}OpndExt; + +/** + * enum OpndRole defines the role of an operand in an instruction + * Can be used as mask to combine def and use. 
The complete def+use + * info can be combined in 2 bits which is used, say in Encoder::OpndRole. + */ +//TODO: this duplicates an Role used in the Ia32::Inst. That duplicate enum should be removed. +typedef enum OpndRole { + OpndRole_Null=0, + OpndRole_Use=0x1, + OpndRole_Def=0x2, + OpndRole_UseDef=OpndRole_Use|OpndRole_Def, + OpndRole_All=0xffff, +} OpndRole; + + +#define REGNAME(k,s,i) ( ((k & OpndKind_Any)<<24) | ((s & OpndSize_Any)<<16) | (i&0xFF) ) + +// Gregory - +// It is critical that all register indexes (3rd number) inside of the +// following table go in ascending order. That is R8 goes after +// RDI. It is necessary for decoder when extending registers from RAX-RDI +// to R8-R15 by simply adding 8 to the index on EM64T architecture +typedef enum RegName { + + RegName_Null = 0, + +#ifdef _EM64T_ + /* + An index part of the RegName-s for RAX-RDI, EAX-ESI, AX-SI and AL-BH is + the same as the index used during instructions encoding. The same rule + applies for XMM regsters for IA32. + For new EM64T registers (both GP and XMM) the index need to be corrected to + obtain the index used in processor's instructions. 
+ */ + RegName_RAX = REGNAME(OpndKind_GPReg,OpndSize_64,0), + RegName_RCX = REGNAME(OpndKind_GPReg,OpndSize_64,1), + RegName_RDX = REGNAME(OpndKind_GPReg,OpndSize_64,2), + RegName_RBX = REGNAME(OpndKind_GPReg,OpndSize_64,3), + RegName_RSP = REGNAME(OpndKind_GPReg,OpndSize_64,4), + RegName_RBP = REGNAME(OpndKind_GPReg,OpndSize_64,5), + RegName_RSI = REGNAME(OpndKind_GPReg,OpndSize_64,6), + RegName_RDI = REGNAME(OpndKind_GPReg,OpndSize_64,7), + + RegName_R8 = REGNAME(OpndKind_GPReg,OpndSize_64,8), + RegName_R9 = REGNAME(OpndKind_GPReg,OpndSize_64,9), + RegName_R10 = REGNAME(OpndKind_GPReg,OpndSize_64,10), + RegName_R11 = REGNAME(OpndKind_GPReg,OpndSize_64,11), + RegName_R12 = REGNAME(OpndKind_GPReg,OpndSize_64,12), + RegName_R13 = REGNAME(OpndKind_GPReg,OpndSize_64,13), + RegName_R14 = REGNAME(OpndKind_GPReg,OpndSize_64,14), + RegName_R15 = REGNAME(OpndKind_GPReg,OpndSize_64,15), +#endif //~_EM64T_ + + RegName_EAX=REGNAME(OpndKind_GPReg,OpndSize_32,0), + RegName_ECX=REGNAME(OpndKind_GPReg,OpndSize_32,1), + RegName_EDX=REGNAME(OpndKind_GPReg,OpndSize_32,2), + RegName_EBX=REGNAME(OpndKind_GPReg,OpndSize_32,3), + RegName_ESP=REGNAME(OpndKind_GPReg,OpndSize_32,4), + RegName_EBP=REGNAME(OpndKind_GPReg,OpndSize_32,5), + RegName_ESI=REGNAME(OpndKind_GPReg,OpndSize_32,6), + RegName_EDI=REGNAME(OpndKind_GPReg,OpndSize_32,7), + +#ifdef _EM64T_ + RegName_R8D = REGNAME(OpndKind_GPReg,OpndSize_32,8), + RegName_R9D = REGNAME(OpndKind_GPReg,OpndSize_32,9), + RegName_R10D = REGNAME(OpndKind_GPReg,OpndSize_32,10), + RegName_R11D = REGNAME(OpndKind_GPReg,OpndSize_32,11), + RegName_R12D = REGNAME(OpndKind_GPReg,OpndSize_32,12), + RegName_R13D = REGNAME(OpndKind_GPReg,OpndSize_32,13), + RegName_R14D = REGNAME(OpndKind_GPReg,OpndSize_32,14), + RegName_R15D = REGNAME(OpndKind_GPReg,OpndSize_32,15), +#endif //~_EM64T_ + + RegName_AX=REGNAME(OpndKind_GPReg,OpndSize_16,0), + RegName_CX=REGNAME(OpndKind_GPReg,OpndSize_16,1), + RegName_DX=REGNAME(OpndKind_GPReg,OpndSize_16,2), + 
RegName_BX=REGNAME(OpndKind_GPReg,OpndSize_16,3), + RegName_SP=REGNAME(OpndKind_GPReg,OpndSize_16,4), + RegName_BP=REGNAME(OpndKind_GPReg,OpndSize_16,5), + RegName_SI=REGNAME(OpndKind_GPReg,OpndSize_16,6), + RegName_DI=REGNAME(OpndKind_GPReg,OpndSize_16,7), + +#ifdef _EM64T_ + RegName_R8S = REGNAME(OpndKind_GPReg,OpndSize_16,8), + RegName_R9S = REGNAME(OpndKind_GPReg,OpndSize_16,9), + RegName_R10S = REGNAME(OpndKind_GPReg,OpndSize_16,10), + RegName_R11S = REGNAME(OpndKind_GPReg,OpndSize_16,11), + RegName_R12S = REGNAME(OpndKind_GPReg,OpndSize_16,12), + RegName_R13S = REGNAME(OpndKind_GPReg,OpndSize_16,13), + RegName_R14S = REGNAME(OpndKind_GPReg,OpndSize_16,14), + RegName_R15S = REGNAME(OpndKind_GPReg,OpndSize_16,15), +#endif //~_EM64T_ + + RegName_AL=REGNAME(OpndKind_GPReg,OpndSize_8,0), + RegName_CL=REGNAME(OpndKind_GPReg,OpndSize_8,1), + RegName_DL=REGNAME(OpndKind_GPReg,OpndSize_8,2), + RegName_BL=REGNAME(OpndKind_GPReg,OpndSize_8,3), + // FIXME: Used in enc_tabl.cpp + // AH is not accessible on EM64T, instead encoded register is SPL, so decoded + // register will return incorrect enum + RegName_AH=REGNAME(OpndKind_GPReg,OpndSize_8,4), +#if !defined(_EM64T_) + RegName_CH=REGNAME(OpndKind_GPReg,OpndSize_8,5), + RegName_DH=REGNAME(OpndKind_GPReg,OpndSize_8,6), + RegName_BH=REGNAME(OpndKind_GPReg,OpndSize_8,7), +#else + RegName_SPL=REGNAME(OpndKind_GPReg,OpndSize_8,4), + RegName_BPL=REGNAME(OpndKind_GPReg,OpndSize_8,5), + RegName_SIL=REGNAME(OpndKind_GPReg,OpndSize_8,6), + RegName_DIL=REGNAME(OpndKind_GPReg,OpndSize_8,7), + RegName_R8L=REGNAME(OpndKind_GPReg,OpndSize_8,8), + RegName_R9L=REGNAME(OpndKind_GPReg,OpndSize_8,9), + RegName_R10L=REGNAME(OpndKind_GPReg,OpndSize_8,10), + RegName_R11L=REGNAME(OpndKind_GPReg,OpndSize_8,11), + RegName_R12L=REGNAME(OpndKind_GPReg,OpndSize_8,12), + RegName_R13L=REGNAME(OpndKind_GPReg,OpndSize_8,13), + RegName_R14L=REGNAME(OpndKind_GPReg,OpndSize_8,14), + RegName_R15L=REGNAME(OpndKind_GPReg,OpndSize_8,15), +#endif + + 
RegName_ES=REGNAME(OpndKind_SReg,OpndSize_16,0), + RegName_CS=REGNAME(OpndKind_SReg,OpndSize_16,1), + RegName_SS=REGNAME(OpndKind_SReg,OpndSize_16,2), + RegName_DS=REGNAME(OpndKind_SReg,OpndSize_16,3), + RegName_FS=REGNAME(OpndKind_SReg,OpndSize_16,4), + RegName_GS=REGNAME(OpndKind_SReg,OpndSize_16,5), + + RegName_EFLAGS=REGNAME(OpndKind_StatusReg,OpndSize_32,0), + +#if !defined(TESTING_ENCODER) + RegName_FP0=REGNAME(OpndKind_FPReg,OpndSize_80,0), + RegName_FP1=REGNAME(OpndKind_FPReg,OpndSize_80,1), + RegName_FP2=REGNAME(OpndKind_FPReg,OpndSize_80,2), + RegName_FP3=REGNAME(OpndKind_FPReg,OpndSize_80,3), + RegName_FP4=REGNAME(OpndKind_FPReg,OpndSize_80,4), + RegName_FP5=REGNAME(OpndKind_FPReg,OpndSize_80,5), + RegName_FP6=REGNAME(OpndKind_FPReg,OpndSize_80,6), + RegName_FP7=REGNAME(OpndKind_FPReg,OpndSize_80,7), +#endif + RegName_FP0S=REGNAME(OpndKind_FPReg,OpndSize_32,0), + RegName_FP1S=REGNAME(OpndKind_FPReg,OpndSize_32,1), + RegName_FP2S=REGNAME(OpndKind_FPReg,OpndSize_32,2), + RegName_FP3S=REGNAME(OpndKind_FPReg,OpndSize_32,3), + RegName_FP4S=REGNAME(OpndKind_FPReg,OpndSize_32,4), + RegName_FP5S=REGNAME(OpndKind_FPReg,OpndSize_32,5), + RegName_FP6S=REGNAME(OpndKind_FPReg,OpndSize_32,6), + RegName_FP7S=REGNAME(OpndKind_FPReg,OpndSize_32,7), + + RegName_FP0D=REGNAME(OpndKind_FPReg,OpndSize_64,0), + RegName_FP1D=REGNAME(OpndKind_FPReg,OpndSize_64,1), + RegName_FP2D=REGNAME(OpndKind_FPReg,OpndSize_64,2), + RegName_FP3D=REGNAME(OpndKind_FPReg,OpndSize_64,3), + RegName_FP4D=REGNAME(OpndKind_FPReg,OpndSize_64,4), + RegName_FP5D=REGNAME(OpndKind_FPReg,OpndSize_64,5), + RegName_FP6D=REGNAME(OpndKind_FPReg,OpndSize_64,6), + RegName_FP7D=REGNAME(OpndKind_FPReg,OpndSize_64,7), + +#if !defined(TESTING_ENCODER) + RegName_XMM0=REGNAME(OpndKind_XMMReg,OpndSize_128,0), + RegName_XMM1=REGNAME(OpndKind_XMMReg,OpndSize_128,1), + RegName_XMM2=REGNAME(OpndKind_XMMReg,OpndSize_128,2), + RegName_XMM3=REGNAME(OpndKind_XMMReg,OpndSize_128,3), + 
RegName_XMM4=REGNAME(OpndKind_XMMReg,OpndSize_128,4), + RegName_XMM5=REGNAME(OpndKind_XMMReg,OpndSize_128,5), + RegName_XMM6=REGNAME(OpndKind_XMMReg,OpndSize_128,6), + RegName_XMM7=REGNAME(OpndKind_XMMReg,OpndSize_128,7), + +#ifdef _EM64T_ + RegName_XMM8 = REGNAME(OpndKind_XMMReg,OpndSize_128,0), + RegName_XMM9 = REGNAME(OpndKind_XMMReg,OpndSize_128,1), + RegName_XMM10 = REGNAME(OpndKind_XMMReg,OpndSize_128,2), + RegName_XMM11 = REGNAME(OpndKind_XMMReg,OpndSize_128,3), + RegName_XMM12 = REGNAME(OpndKind_XMMReg,OpndSize_128,4), + RegName_XMM13 = REGNAME(OpndKind_XMMReg,OpndSize_128,5), + RegName_XMM14 = REGNAME(OpndKind_XMMReg,OpndSize_128,6), + RegName_XMM15 = REGNAME(OpndKind_XMMReg,OpndSize_128,7), +#endif //~_EM64T_ + +#endif // ~TESTING_ENCODER + + RegName_XMM0S=REGNAME(OpndKind_XMMReg,OpndSize_32,0), + RegName_XMM1S=REGNAME(OpndKind_XMMReg,OpndSize_32,1), + RegName_XMM2S=REGNAME(OpndKind_XMMReg,OpndSize_32,2), + RegName_XMM3S=REGNAME(OpndKind_XMMReg,OpndSize_32,3), + RegName_XMM4S=REGNAME(OpndKind_XMMReg,OpndSize_32,4), + RegName_XMM5S=REGNAME(OpndKind_XMMReg,OpndSize_32,5), + RegName_XMM6S=REGNAME(OpndKind_XMMReg,OpndSize_32,6), + RegName_XMM7S=REGNAME(OpndKind_XMMReg,OpndSize_32,7), +#ifdef _EM64T_ + RegName_XMM8S=REGNAME(OpndKind_XMMReg,OpndSize_32,8), + RegName_XMM9S=REGNAME(OpndKind_XMMReg,OpndSize_32,9), + RegName_XMM10S=REGNAME(OpndKind_XMMReg,OpndSize_32,10), + RegName_XMM11S=REGNAME(OpndKind_XMMReg,OpndSize_32,11), + RegName_XMM12S=REGNAME(OpndKind_XMMReg,OpndSize_32,12), + RegName_XMM13S=REGNAME(OpndKind_XMMReg,OpndSize_32,13), + RegName_XMM14S=REGNAME(OpndKind_XMMReg,OpndSize_32,14), + RegName_XMM15S=REGNAME(OpndKind_XMMReg,OpndSize_32,15), +#endif // ifdef _EM64T_ + RegName_XMM0D=REGNAME(OpndKind_XMMReg,OpndSize_64,0), + RegName_XMM1D=REGNAME(OpndKind_XMMReg,OpndSize_64,1), + RegName_XMM2D=REGNAME(OpndKind_XMMReg,OpndSize_64,2), + RegName_XMM3D=REGNAME(OpndKind_XMMReg,OpndSize_64,3), + RegName_XMM4D=REGNAME(OpndKind_XMMReg,OpndSize_64,4), + 
RegName_XMM5D=REGNAME(OpndKind_XMMReg,OpndSize_64,5), + RegName_XMM6D=REGNAME(OpndKind_XMMReg,OpndSize_64,6), + RegName_XMM7D=REGNAME(OpndKind_XMMReg,OpndSize_64,7), +#ifdef _EM64T_ + RegName_XMM8D=REGNAME(OpndKind_XMMReg,OpndSize_64,8), + RegName_XMM9D=REGNAME(OpndKind_XMMReg,OpndSize_64,9), + RegName_XMM10D=REGNAME(OpndKind_XMMReg,OpndSize_64,10), + RegName_XMM11D=REGNAME(OpndKind_XMMReg,OpndSize_64,11), + RegName_XMM12D=REGNAME(OpndKind_XMMReg,OpndSize_64,12), + RegName_XMM13D=REGNAME(OpndKind_XMMReg,OpndSize_64,13), + RegName_XMM14D=REGNAME(OpndKind_XMMReg,OpndSize_64,14), + RegName_XMM15D=REGNAME(OpndKind_XMMReg,OpndSize_64,15), +#endif // ifdef _EM64T_ +#ifdef _HAVE_MMX_ + RegName_MMX0=REGNAME(OpndKind_MMXReg,OpndSize_64,0), + RegName_MMX1=REGNAME(OpndKind_MMXReg,OpndSize_64,1), + RegName_MMX2=REGNAME(OpndKind_MMXReg,OpndSize_64,2), + RegName_MMX3=REGNAME(OpndKind_MMXReg,OpndSize_64,3), + RegName_MMX4=REGNAME(OpndKind_MMXReg,OpndSize_64,4), + RegName_MMX5=REGNAME(OpndKind_MMXReg,OpndSize_64,5), + RegName_MMX6=REGNAME(OpndKind_MMXReg,OpndSize_64,6), + RegName_MMX7=REGNAME(OpndKind_MMXReg,OpndSize_64,7), +#endif // _HAVE_MMX_ +} RegName; + +#if 0 // Android x86: use mnemonics defined in enc_defs_ext.h +/** + * Conditional mnemonics. + * The values match the 'real' (==processor's) values of the appropriate + * condition values used in the opcodes. 
+ */ +enum ConditionMnemonic { + + ConditionMnemonic_O=0, + ConditionMnemonic_NO=1, + ConditionMnemonic_B=2, ConditionMnemonic_NAE=ConditionMnemonic_B, ConditionMnemonic_C=ConditionMnemonic_B, + ConditionMnemonic_NB=3, ConditionMnemonic_AE=ConditionMnemonic_NB, ConditionMnemonic_NC=ConditionMnemonic_NB, + ConditionMnemonic_Z=4, ConditionMnemonic_E=ConditionMnemonic_Z, + ConditionMnemonic_NZ=5, ConditionMnemonic_NE=ConditionMnemonic_NZ, + ConditionMnemonic_BE=6, ConditionMnemonic_NA=ConditionMnemonic_BE, + ConditionMnemonic_NBE=7, ConditionMnemonic_A=ConditionMnemonic_NBE, + + ConditionMnemonic_S=8, + ConditionMnemonic_NS=9, + ConditionMnemonic_P=10, ConditionMnemonic_PE=ConditionMnemonic_P, + ConditionMnemonic_NP=11, ConditionMnemonic_PO=ConditionMnemonic_NP, + ConditionMnemonic_L=12, ConditionMnemonic_NGE=ConditionMnemonic_L, + ConditionMnemonic_NL=13, ConditionMnemonic_GE=ConditionMnemonic_NL, + ConditionMnemonic_LE=14, ConditionMnemonic_NG=ConditionMnemonic_LE, + ConditionMnemonic_NLE=15, ConditionMnemonic_G=ConditionMnemonic_NLE, + ConditionMnemonic_Count=16 +}; + + +#define CCM(prefix,cond) Mnemonic_##prefix##cond=Mnemonic_##prefix##cc+ConditionMnemonic_##cond + +//========================================================================================================= +enum Mnemonic { + +Mnemonic_NULL=0, Mnemonic_Null=Mnemonic_NULL, +Mnemonic_ADC, // Add with Carry +Mnemonic_ADD, // Add +Mnemonic_ADDSD, // Add Scalar Double-Precision Floating-Point Values +Mnemonic_ADDSS, // Add Scalar Single-Precision Floating-Point Values +Mnemonic_AND, // Logical AND + +Mnemonic_BSF, // Bit scan forward +Mnemonic_BSR, // Bit scan reverse + +Mnemonic_CALL, // Call Procedure +Mnemonic_CMC, // Complement Carry Flag +Mnemonic_CWD, Mnemonic_CDQ=Mnemonic_CWD,// Convert Word to Doubleword/Convert Doubleword to Qua T dword +Mnemonic_CMOVcc, // Conditional Move + CCM(CMOV,O), + CCM(CMOV,NO), + CCM(CMOV,B), CCM(CMOV,NAE), CCM(CMOV,C), + CCM(CMOV,NB), CCM(CMOV,AE), CCM(CMOV,NC), + 
CCM(CMOV,Z), CCM(CMOV,E), + CCM(CMOV,NZ), CCM(CMOV,NE), + CCM(CMOV,BE), CCM(CMOV,NA), + CCM(CMOV,NBE), CCM(CMOV,A), + + CCM(CMOV,S), + CCM(CMOV,NS), + CCM(CMOV,P), CCM(CMOV,PE), + CCM(CMOV,NP), CCM(CMOV,PO), + CCM(CMOV,L), CCM(CMOV,NGE), + CCM(CMOV,NL), CCM(CMOV,GE), + CCM(CMOV,LE), CCM(CMOV,NG), + CCM(CMOV,NLE), CCM(CMOV,G), + +Mnemonic_CMP, // Compare Two Operands +Mnemonic_CMPXCHG, // Compare and exchange +Mnemonic_CMPXCHG8B, // Compare and Exchange 8 Bytes +Mnemonic_CMPSB, // Compare Two Bytes at DS:ESI and ES:EDI +Mnemonic_CMPSW, // Compare Two Words at DS:ESI and ES:EDI +Mnemonic_CMPSD, // Compare Two Doublewords at DS:ESI and ES:EDI +// +// double -> float +Mnemonic_CVTSD2SS, // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value +// double -> I_32 +Mnemonic_CVTSD2SI, // Convert Scalar Double-Precision Floating-Point Value to Doubleword Integer +// double [truncated] -> I_32 +Mnemonic_CVTTSD2SI, // Convert with Truncation Scalar Double-Precision Floating-Point Value to Signed Doubleword Integer +// +// float -> double +Mnemonic_CVTSS2SD, // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value +// float -> I_32 +Mnemonic_CVTSS2SI, // Convert Scalar Single-Precision Floating-Point Value to Doubleword Integer +// float [truncated] -> I_32 +Mnemonic_CVTTSS2SI, // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer +// +// I_32 -> double +Mnemonic_CVTSI2SD, // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value +// I_32 -> float +Mnemonic_CVTSI2SS, // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value + +Mnemonic_COMISD, // Compare Scalar Ordered Double-Precision Floating-Point Values and Set EFLAGS +Mnemonic_COMISS, // Compare Scalar Ordered Single-Precision Floating-Point Values and Set EFLAGS +Mnemonic_DEC, // Decrement by 1 +//Mnemonic_DIV, // Unsigned Divide +Mnemonic_DIVSD, // 
Divide Scalar Double-Precision Floating-Point Values +Mnemonic_DIVSS, // Divide Scalar Single-Precision Floating-Point Values + +#ifdef _HAVE_MMX_ +Mnemonic_EMMS, // Empty MMX Technology State +#endif + +Mnemonic_ENTER, // ENTER-Make Stack Frame for Procedure Parameters +Mnemonic_FLDCW, // Load FPU control word +Mnemonic_FADDP, +Mnemonic_FLDZ, +Mnemonic_FADD, +Mnemonic_FSUBP, +Mnemonic_FSUB, +Mnemonic_FISUB, +Mnemonic_FMUL, +Mnemonic_FMULP, +Mnemonic_FDIVP, +Mnemonic_FDIV, +Mnemonic_FUCOMPP, +Mnemonic_FRNDINT, +Mnemonic_FNSTCW, // Store FPU control word +Mnemonic_FSTSW, // Store FPU status word +Mnemonic_FNSTSW, // Store FPU status word +//Mnemonic_FDECSTP, // Decrement Stack-Top Pointer +Mnemonic_FILD, // Load Integer +Mnemonic_FLD, // Load Floating Point Value +Mnemonic_FLDLG2, +Mnemonic_FLDLN2, +Mnemonic_FLD1, + +Mnemonic_FCLEX, // Clear Exceptions +Mnemonic_FCHS, // Change sign of ST0 +Mnemonic_FNCLEX, // Clear Exceptions + +//Mnemonic_FINCSTP, // Increment Stack-Top Pointer +Mnemonic_FIST, // Store Integer +Mnemonic_FISTP, // Store Integer, pop FPU stack +Mnemonic_FISTTP, // Store Integer with Truncation +Mnemonic_FPREM, // Partial Remainder +Mnemonic_FPREM1, // Partial Remainder +Mnemonic_FST, // Store Floating Point Value +Mnemonic_FSTP, // Store Floating Point Value and pop the FP stack +Mnemonic_FSQRT, //Computes the square root of the source value in the stack and pop the FP stack +Mnemonic_FABS, //Computes the absolute value of the source value in the stack and pop the FP stack +Mnemonic_FSIN, //Computes the sine of the source value in the stack and pop the FP stack +Mnemonic_FCOS, //Computes the cosine of the source value in the stack and pop the FP stack +Mnemonic_FPTAN, //Computes the tangent of the source value in the stack and pop the FP stack +Mnemonic_FYL2X, +Mnemonic_FYL2XP1, +Mnemonic_F2XM1, +Mnemonic_FPATAN, +Mnemonic_FXCH, +Mnemonic_FSCALE, + +Mnemonic_XCHG, +Mnemonic_DIV, // Unsigned Divide +Mnemonic_IDIV, // Signed Divide +Mnemonic_MUL, // 
Unsigned Multiply +Mnemonic_IMUL, // Signed Multiply +Mnemonic_INC, // Increment by 1 +Mnemonic_INT3, // Call break point +Mnemonic_Jcc, // Jump if Condition Is Met + CCM(J,O), + CCM(J,NO), + CCM(J,B), CCM(J,NAE), CCM(J,C), + CCM(J,NB), CCM(J,AE), CCM(J,NC), + CCM(J,Z), CCM(J,E), + CCM(J,NZ), CCM(J,NE), + CCM(J,BE), CCM(J,NA), + CCM(J,NBE), CCM(J,A), + CCM(J,S), + CCM(J,NS), + CCM(J,P), CCM(J,PE), + CCM(J,NP), CCM(J,PO), + CCM(J,L), CCM(J,NGE), + CCM(J,NL), CCM(J,GE), + CCM(J,LE), CCM(J,NG), + CCM(J,NLE), CCM(J,G), +Mnemonic_JMP, // Jump +Mnemonic_LEA, // Load Effective Address +Mnemonic_LEAVE, // High Level Procedure Exit +Mnemonic_LOOP, // Loop according to ECX counter +Mnemonic_LOOPE, // Loop according to ECX counter +Mnemonic_LOOPNE, Mnemonic_LOOPNZ = Mnemonic_LOOPNE, // Loop according to ECX +Mnemonic_LAHF, // Load Flags into AH +Mnemonic_MOV, // Move +Mnemonic_MOVD, // Move Double word +Mnemonic_MOVQ, // Move Quadword +/*Mnemonic_MOVS, // Move Data from String to String*/ +// MOVS is a special case: see encoding table for more details, +Mnemonic_MOVS8, Mnemonic_MOVS16, Mnemonic_MOVS32, Mnemonic_MOVS64, +// +Mnemonic_MOVAPD, // Move Scalar Double-Precision Floating-Point Value +Mnemonic_MOVSD, // Move Scalar Double-Precision Floating-Point Value +Mnemonic_MOVSS, // Move Scalar Single-Precision Floating-Point Values +Mnemonic_MOVSX, // Move with Sign-Extension +Mnemonic_MOVZX, // Move with Zero-Extend +//Mnemonic_MUL, // Unsigned Multiply +Mnemonic_MULSD, // Multiply Scalar Double-Precision Floating-Point Values +Mnemonic_MULSS, // Multiply Scalar Single-Precision Floating-Point Values +Mnemonic_NEG, // Two's Complement Negation +Mnemonic_NOP, // No Operation +Mnemonic_NOT, // One's Complement Negation +Mnemonic_OR, // Logical Inclusive OR +Mnemonic_PREFETCH, // prefetch + +#ifdef _HAVE_MMX_ + Mnemonic_PADDQ, // Add Packed Quadword Integers + Mnemonic_PAND, // Logical AND + Mnemonic_POR, // Bitwise Logical OR + Mnemonic_PSUBQ, // Subtract Packed Quadword 
Integers +#endif + +Mnemonic_PXOR, // Logical Exclusive OR +Mnemonic_POP, // Pop a Value from the Stack +Mnemonic_POPFD, // Pop a Value of EFLAGS register from the Stack +Mnemonic_PUSH, // Push Word or Doubleword Onto the Stack +Mnemonic_PUSHFD, // Push EFLAGS Doubleword Onto the Stack +Mnemonic_RET, // Return from Procedure + +Mnemonic_SETcc, // Set Byte on Condition + CCM(SET,O), + CCM(SET,NO), + CCM(SET,B), CCM(SET,NAE), CCM(SET,C), + CCM(SET,NB), CCM(SET,AE), CCM(SET,NC), + CCM(SET,Z), CCM(SET,E), + CCM(SET,NZ), CCM(SET,NE), + CCM(SET,BE), CCM(SET,NA), + CCM(SET,NBE), CCM(SET,A), + CCM(SET,S), + CCM(SET,NS), + CCM(SET,P), CCM(SET,PE), + CCM(SET,NP), CCM(SET,PO), + CCM(SET,L), CCM(SET,NGE), + CCM(SET,NL), CCM(SET,GE), + CCM(SET,LE), CCM(SET,NG), + CCM(SET,NLE), CCM(SET,G), + +Mnemonic_SAL, Mnemonic_SHL=Mnemonic_SAL,// Shift left +Mnemonic_SAR, // Shift right +Mnemonic_ROR, // Rotate right +Mnemonic_RCR, // Rotate right through CARRY flag +Mnemonic_ROL, // Rotate left +Mnemonic_RCL, // Rotate left through CARRY flag +Mnemonic_SHR, // Unsigned shift right +Mnemonic_SHRD, // Double Precision Shift Right +Mnemonic_SHLD, // Double Precision Shift Left + +Mnemonic_SBB, // Integer Subtraction with Borrow +Mnemonic_SUB, // Subtract +Mnemonic_SUBSD, // Subtract Scalar Double-Precision Floating-Point Values +Mnemonic_SUBSS, // Subtract Scalar Single-Precision Floating-Point Values + +Mnemonic_TEST, // Logical Compare + +Mnemonic_UCOMISD, // Unordered Compare Scalar Double-Precision Floating-Point Values and Set EFLAGS +Mnemonic_UCOMISS, // Unordered Compare Scalar Single-Precision Floating-Point Values and Set EFLAGS + +Mnemonic_XOR, // Logical Exclusive OR +// +// packed things, +// +Mnemonic_XORPD, // Bitwise Logical XOR for Double-Precision Floating-Point Values +Mnemonic_XORPS, // Bitwise Logical XOR for Single-Precision Floating-Point Values + +Mnemonic_CVTDQ2PD, // Convert Packed Doubleword Integers to Packed Double-Precision Floating-Point Values 
+Mnemonic_CVTTPD2DQ, // Convert with Truncation Packed Double-Precision Floating-Point Values to Packed Doubleword Integers + +Mnemonic_CVTDQ2PS, // Convert Packed Doubleword Integers to Packed Single-Precision Floating-Point Values +Mnemonic_CVTTPS2DQ, // Convert with Truncation Packed Single-Precision Floating-Point Values to Packed Doubleword Integers +// +// String operations +// +Mnemonic_STD, // Set direction flag +Mnemonic_CLD, // Clear direction flag +Mnemonic_SCAS, // Scan string +Mnemonic_STOS, // Store string + +// +Mnemonic_WAIT, // Check pending pending unmasked floating-point exception +// +Mnemonic_Count +}; + +#undef CCM +#endif + +/** + * @brief Instruction prefixes, according to arch manual. + */ +typedef enum InstPrefix { + InstPrefix_Null = 0, + // Group 1 + InstPrefix_LOCK = 0xF0, + InstPrefix_REPNE = 0xF2, + InstPrefix_REPNZ = InstPrefix_REPNE, + InstPrefix_REP = 0xF3, InstPrefix_REPZ = InstPrefix_REP, + // Group 2 + InstPrefix_CS = 0x2E, + InstPrefix_SS = 0x36, + InstPrefix_DS = 0x3E, + InstPrefix_ES = 0x26, + InstPrefix_FS = 0x64, + InstPrefix_GS = 0x65, + // + InstPrefix_HintTaken = 0x3E, + InstPrefix_HintNotTaken = 0x2E, + // Group 3 + InstPrefix_OpndSize = 0x66, + // Group 4 + InstPrefix_AddrSize = 0x67 +} InstPrefix; + +inline unsigned getSizeBytes(OpndSize sz) +{ + if (sz==OpndSize_64) { return 8; } + if (sz==OpndSize_32) { return 4; } + if (sz==OpndSize_16) { return 2; } + if (sz==OpndSize_8) { return 1; } + assert(false); + return 0; +} + +inline bool isRegKind(OpndKind kind) +{ + return OpndKind_GPReg<= kind && kind<=OpndKind_MaxRegKind; +} + +/** + * @brief Returns RegName for a given name. + * + * Name is case-insensitive. + * @param regname - string name of a register + * @return RegName for the given name, or RegName_Null if name is invalid + */ +RegName getRegName(const char * regname); +/** + * Constructs RegName from the given OpndKind, size and index. 
+ */ +inline RegName getRegName(OpndKind k, OpndSize s, int idx) +{ + return (RegName)REGNAME(k,s,idx); +} +/** + * Extracts a bit mask with a bit set at the position of the register's index. + */ +inline unsigned getRegMask(RegName reg) +{ + return 1<<(reg&0xff); +} +/** + * @brief Extracts OpndKind from the RegName. + */ +inline OpndKind getRegKind(RegName reg) +{ + return (OpndKind)(reg>>24); +} +/** + * @brief Extracts OpndSize from RegName. + */ +inline OpndSize getRegSize(RegName reg) +{ + return (OpndSize)((reg>>16)&0xFF); +} +/** + * Extracts an index from the given RegName. + */ +inline unsigned char getRegIndex(RegName reg) +{ + return (unsigned char)(reg&0xFF); +} +/** + * Returns a string name of the given RegName. The name returned is in upper-case. + * Returns NULL if invalid RegName specified. + */ +const char * getRegNameString(RegName reg); +/** + * Returns string name of a given OpndSize. + * Returns NULL if invalid OpndSize passed. + */ +const char * getOpndSizeString(OpndSize size); +/** + * Returns OpndSize passed by its string representation (case insensitive). + * Returns OpndSize_Null if invalid string specified. + * The 'sizeString' can not be NULL. + */ +OpndSize getOpndSize(const char * sizeString); +/** + * Returns string name of a given OpndKind. + * Returns NULL if the passed kind is invalid. + */ +const char * getOpndKindString(OpndKind kind); +/** + * Returns OpndKind found by its string representation (case insensitive). + * Returns OpndKind_Null if the name is invalid. + * The 'kindString' can not be NULL. + */ +OpndKind getOpndKind(const char * kindString); +/** + * + */ +const char * getConditionString(ConditionMnemonic cm); + +/** + * Constructs an RegName with the same index and kind, but with a different size from + * the given RegName (i.e. getRegAlias(EAX, OpndSize_16) => AX; getRegAlias(BL, OpndSize_32) => EBX). + * The constructed RegName is not checked in any way and thus may be invalid. 
+ * Note, that the aliasing does not work for at least AH,BH,CH,DH, ESI, EDI, ESP and EBP regs. + */ +inline RegName getAliasReg(RegName reg, OpndSize sz) +{ + return (RegName)REGNAME(getRegKind(reg), sz, getRegIndex(reg)); +} + +/** + * brief Tests two RegName-s of the same kind for equality. + * + * @note Does work for 8 bit general purpose registers (AH, AL, BH, BL, etc). + */ +inline bool equals(RegName r0, RegName r1) +{ + return getRegKind(r0) == getRegKind(r1) && + getRegIndex(r0) == getRegIndex(r1); +} + +ENCODER_NAMESPACE_END + +#endif // ifndef _ENCODER_DEFS_H_ diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_defs_ext.h b/libpixelflinger/codeflinger/x86/libenc/enc_defs_ext.h new file mode 100644 index 0000000..53f6d44 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_defs_ext.h @@ -0,0 +1,365 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _ENCODER_DEFS_EXT_H_ +#define _ENCODER_DEFS_EXT_H_ + + +// Used to isolate experimental or being tuned encoder into a separate +// namespace so it can coexist with a stable one in the same bundle. 
+#ifdef ENCODER_ISOLATE + #define ENCODER_NAMESPACE_START namespace enc_ia32 { + #define ENCODER_NAMESPACE_END }; +#else + #define ENCODER_NAMESPACE_START + #define ENCODER_NAMESPACE_END +#endif + +ENCODER_NAMESPACE_START +typedef enum OpndSize { + /** + * A change must be balanced with at least the following places: + * Ia32IRConstants.h :: getByteSize() uses some presumptions about OpndSize_ values + * Ia32::Constraint-s use the OpndSize as a mask + * encoder.cpp & encoder_master_info.cpp uses OpndSize as an index for hashing + * - perhaps there are much more places + */ + OpndSize_Null = 0, + OpndSize_8 = 0x01, + OpndSize_16 = 0x02, + OpndSize_32 = 0x04, + OpndSize_64 = 0x08, +#if !defined(TESTING_ENCODER) + OpndSize_80 = 0x10, + OpndSize_128 = 0x20, +#endif + OpndSize_Max, + OpndSize_Any = 0x3F, + OpndSize_Default = OpndSize_Any +} OpndSize; + +/** + * Conditional mnemonics. + * The values match the 'real' (==processor's) values of the appropriate + * condition values used in the opcodes. 
+ */ +typedef enum ConditionMnemonic { + + ConditionMnemonic_O=0, + ConditionMnemonic_NO=1, + ConditionMnemonic_B=2, ConditionMnemonic_NAE=ConditionMnemonic_B, ConditionMnemonic_C=ConditionMnemonic_B, + ConditionMnemonic_NB=3, ConditionMnemonic_AE=ConditionMnemonic_NB, ConditionMnemonic_NC=ConditionMnemonic_NB, + ConditionMnemonic_Z=4, ConditionMnemonic_E=ConditionMnemonic_Z, + ConditionMnemonic_NZ=5, ConditionMnemonic_NE=ConditionMnemonic_NZ, + ConditionMnemonic_BE=6, ConditionMnemonic_NA=ConditionMnemonic_BE, + ConditionMnemonic_NBE=7, ConditionMnemonic_A=ConditionMnemonic_NBE, + + ConditionMnemonic_S=8, + ConditionMnemonic_NS=9, + ConditionMnemonic_P=10, ConditionMnemonic_PE=ConditionMnemonic_P, + ConditionMnemonic_NP=11, ConditionMnemonic_PO=ConditionMnemonic_NP, + ConditionMnemonic_L=12, ConditionMnemonic_NGE=ConditionMnemonic_L, + ConditionMnemonic_NL=13, ConditionMnemonic_GE=ConditionMnemonic_NL, + ConditionMnemonic_LE=14, ConditionMnemonic_NG=ConditionMnemonic_LE, + ConditionMnemonic_NLE=15, ConditionMnemonic_G=ConditionMnemonic_NLE, + ConditionMnemonic_Count=16 +} ConditionMnemonic; + + +#define CCM(prefix,cond) Mnemonic_##prefix##cond=Mnemonic_##prefix##cc+ConditionMnemonic_##cond + +//========================================================================================================= +typedef enum Mnemonic { + +Mnemonic_NULL=0, Mnemonic_Null=Mnemonic_NULL, +Mnemonic_JMP, // Jump +Mnemonic_MOV, // Move +Mnemonic_Jcc, // Jump if Condition Is Met + CCM(J,O), + CCM(J,NO), + CCM(J,B), CCM(J,NAE), CCM(J,C), + CCM(J,NB), CCM(J,AE), CCM(J,NC), + CCM(J,Z), CCM(J,E), + CCM(J,NZ), CCM(J,NE), + CCM(J,BE), CCM(J,NA), + CCM(J,NBE), CCM(J,A), + CCM(J,S), + CCM(J,NS), + CCM(J,P), CCM(J,PE), + CCM(J,NP), CCM(J,PO), + CCM(J,L), CCM(J,NGE), + CCM(J,NL), CCM(J,GE), + CCM(J,LE), CCM(J,NG), + CCM(J,NLE), CCM(J,G), +Mnemonic_CALL, // Call Procedure + +Mnemonic_ADC, // Add with Carry +Mnemonic_ADD, // Add +Mnemonic_ADDSD, // Add Scalar Double-Precision Floating-Point 
Values +Mnemonic_ADDSS, // Add Scalar Single-Precision Floating-Point Values +Mnemonic_AND, // Logical AND + +Mnemonic_BSF, // Bit scan forward +Mnemonic_BSR, // Bit scan reverse + +Mnemonic_CMC, // Complement Carry Flag +Mnemonic_CWD, Mnemonic_CDQ=Mnemonic_CWD,// Convert Word to Doubleword/Convert Doubleword to Qua T dword +Mnemonic_CMOVcc, // Conditional Move + CCM(CMOV,O), + CCM(CMOV,NO), + CCM(CMOV,B), CCM(CMOV,NAE), CCM(CMOV,C), + CCM(CMOV,NB), CCM(CMOV,AE), CCM(CMOV,NC), + CCM(CMOV,Z), CCM(CMOV,E), + CCM(CMOV,NZ), CCM(CMOV,NE), + CCM(CMOV,BE), CCM(CMOV,NA), + CCM(CMOV,NBE), CCM(CMOV,A), + + CCM(CMOV,S), + CCM(CMOV,NS), + CCM(CMOV,P), CCM(CMOV,PE), + CCM(CMOV,NP), CCM(CMOV,PO), + CCM(CMOV,L), CCM(CMOV,NGE), + CCM(CMOV,NL), CCM(CMOV,GE), + CCM(CMOV,LE), CCM(CMOV,NG), + CCM(CMOV,NLE), CCM(CMOV,G), + +Mnemonic_CMP, // Compare Two Operands +Mnemonic_CMPXCHG, // Compare and exchange +Mnemonic_CMPXCHG8B, // Compare and Exchange 8 Bytes +Mnemonic_CMPSB, // Compare Two Bytes at DS:ESI and ES:EDI +Mnemonic_CMPSW, // Compare Two Words at DS:ESI and ES:EDI +Mnemonic_CMPSD, // Compare Two Doublewords at DS:ESI and ES:EDI +// +// double -> float +Mnemonic_CVTSD2SS, // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value +// double -> I_32 +Mnemonic_CVTSD2SI, // Convert Scalar Double-Precision Floating-Point Value to Doubleword Integer +// double [truncated] -> I_32 +Mnemonic_CVTTSD2SI, // Convert with Truncation Scalar Double-Precision Floating-Point Value to Signed Doubleword Integer +// +// float -> double +Mnemonic_CVTSS2SD, // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value +// float -> I_32 +Mnemonic_CVTSS2SI, // Convert Scalar Single-Precision Floating-Point Value to Doubleword Integer +// float [truncated] -> I_32 +Mnemonic_CVTTSS2SI, // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer +// +// I_32 -> double 
+Mnemonic_CVTSI2SD, // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value +// I_32 -> float +Mnemonic_CVTSI2SS, // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value + +Mnemonic_COMISD, // Compare Scalar Ordered Double-Precision Floating-Point Values and Set EFLAGS +Mnemonic_COMISS, // Compare Scalar Ordered Single-Precision Floating-Point Values and Set EFLAGS +Mnemonic_DEC, // Decrement by 1 +Mnemonic_DIVSD, // Divide Scalar Double-Precision Floating-Point Values +Mnemonic_DIVSS, // Divide Scalar Single-Precision Floating-Point Values +Mnemonic_ENTER, // ENTER-Make Stack Frame for Procedure Parameters +Mnemonic_FLDCW, // Load FPU control word +Mnemonic_FADDP, +Mnemonic_FLDZ, +Mnemonic_FADD, +Mnemonic_FSUBP, +Mnemonic_FSUB, +Mnemonic_FISUB, +Mnemonic_FMUL, +Mnemonic_FMULP, +Mnemonic_FDIVP, +Mnemonic_FDIV, +Mnemonic_FUCOM, +Mnemonic_FUCOMI, +Mnemonic_FUCOMP, +Mnemonic_FUCOMIP, +Mnemonic_FUCOMPP, +Mnemonic_FRNDINT, +Mnemonic_FNSTCW, // Store FPU control word +Mnemonic_FSTSW, // Store FPU status word +Mnemonic_FNSTSW, // Store FPU status word +Mnemonic_FILD, // Load Integer +Mnemonic_FLD, // Load Floating Point Value +Mnemonic_FLDLG2, +Mnemonic_FLDLN2, +Mnemonic_FLD1, + +Mnemonic_FCLEX, // Clear Exceptions +Mnemonic_FCHS, // Change sign of ST0 +Mnemonic_FNCLEX, // Clear Exceptions +Mnemonic_FIST, // Store Integer +Mnemonic_FISTP, // Store Integer, pop FPU stack +Mnemonic_FISTTP, // Store Integer with Truncation +Mnemonic_FPREM, // Partial Remainder +Mnemonic_FPREM1, // Partial Remainder +Mnemonic_FST, // Store Floating Point Value +Mnemonic_FSTP, // Store Floating Point Value and pop the FP stack +Mnemonic_FSQRT, //Computes the square root of the source value in the stack and pop the FP stack +Mnemonic_FABS, //Computes the absolute value of the source value in the stack and pop the FP stack +Mnemonic_FSIN, //Computes the sine of the source value in the stack and pop the FP stack +Mnemonic_FCOS, //Computes the cosine of 
the source value in the stack and pop the FP stack +Mnemonic_FPTAN, //Computes the tangent of the source value in the stack and pop the FP stack +Mnemonic_FYL2X, +Mnemonic_FYL2XP1, +Mnemonic_F2XM1, +Mnemonic_FPATAN, +Mnemonic_FXCH, +Mnemonic_FSCALE, + +Mnemonic_XCHG, +Mnemonic_DIV, // Unsigned Divide +Mnemonic_IDIV, // Signed Divide +Mnemonic_MUL, // Unsigned Multiply +Mnemonic_IMUL, // Signed Multiply +Mnemonic_INC, // Increment by 1 +Mnemonic_INT3, // Call break point + +Mnemonic_LEA, // Load Effective Address +Mnemonic_LEAVE, // High Level Procedure Exit +Mnemonic_LOOP, // Loop according to ECX counter +Mnemonic_LOOPE, // Loop according to ECX counter +Mnemonic_LOOPNE, Mnemonic_LOOPNZ = Mnemonic_LOOPNE, // Loop according to ECX +Mnemonic_LAHF, // Load Flags into AH +Mnemonic_MOVD, // Move Double word +Mnemonic_MOVQ, // Move Quadword +Mnemonic_MOVS8, +Mnemonic_MOVS16, +Mnemonic_MOVS32, +Mnemonic_MOVS64, +Mnemonic_MOVAPD, // Move Scalar Double-Precision Floating-Point Value +Mnemonic_MOVSD, // Move Scalar Double-Precision Floating-Point Value +Mnemonic_MOVSS, // Move Scalar Single-Precision Floating-Point Values +Mnemonic_MOVSX, // Move with Sign-Extension +Mnemonic_MOVZX, // Move with Zero-Extend +Mnemonic_MULSD, // Multiply Scalar Double-Precision Floating-Point Values +Mnemonic_MULSS, // Multiply Scalar Single-Precision Floating-Point Values +Mnemonic_NEG, // Two's Complement Negation +Mnemonic_NOP, // No Operation +Mnemonic_NOT, // One's Complement Negation +Mnemonic_OR, // Logical Inclusive OR +Mnemonic_PREFETCH, // prefetch +Mnemonic_PADDQ, // Add Packed Quadword Integers +Mnemonic_PAND, // Logical AND +Mnemonic_POR, // Bitwise Logical OR +Mnemonic_PSUBQ, // Subtract Packed Quadword Integers +Mnemonic_PANDN, +Mnemonic_PSLLQ, +Mnemonic_PSRLQ, +Mnemonic_PXOR, // Logical Exclusive OR +Mnemonic_POP, // Pop a Value from the Stack +Mnemonic_POPFD, // Pop a Value of EFLAGS register from the Stack +Mnemonic_PUSH, // Push Word or Doubleword Onto the Stack 
+Mnemonic_PUSHFD, // Push EFLAGS Doubleword Onto the Stack +Mnemonic_RET, // Return from Procedure + +Mnemonic_SETcc, // Set Byte on Condition + CCM(SET,O), + CCM(SET,NO), + CCM(SET,B), CCM(SET,NAE), CCM(SET,C), + CCM(SET,NB), CCM(SET,AE), CCM(SET,NC), + CCM(SET,Z), CCM(SET,E), + CCM(SET,NZ), CCM(SET,NE), + CCM(SET,BE), CCM(SET,NA), + CCM(SET,NBE), CCM(SET,A), + CCM(SET,S), + CCM(SET,NS), + CCM(SET,P), CCM(SET,PE), + CCM(SET,NP), CCM(SET,PO), + CCM(SET,L), CCM(SET,NGE), + CCM(SET,NL), CCM(SET,GE), + CCM(SET,LE), CCM(SET,NG), + CCM(SET,NLE), CCM(SET,G), + +Mnemonic_SAL, Mnemonic_SHL=Mnemonic_SAL,// Shift left +Mnemonic_SAR, // Unsigned shift right +Mnemonic_ROR, // Rotate right +Mnemonic_RCR, // Rotate right through CARRY flag +Mnemonic_ROL, // Rotate left +Mnemonic_RCL, // Rotate left through CARRY flag +Mnemonic_SHR, // Signed shift right +Mnemonic_SHRD, // Double Precision Shift Right +Mnemonic_SHLD, // Double Precision Shift Left + +Mnemonic_SBB, // Integer Subtraction with Borrow +Mnemonic_SUB, // Subtract +Mnemonic_SUBSD, // Subtract Scalar Double-Precision Floating-Point Values +Mnemonic_SUBSS, // Subtract Scalar Single-Precision Floating-Point Values + +Mnemonic_TEST, // Logical Compare + +Mnemonic_UCOMISD, // Unordered Compare Scalar Double-Precision Floating-Point Values and Set EFLAGS +Mnemonic_UCOMISS, // Unordered Compare Scalar Single-Precision Floating-Point Values and Set EFLAGS + +Mnemonic_XOR, // Logical Exclusive OR +// +// packed things, +// +Mnemonic_XORPD, // Bitwise Logical XOR for Double-Precision Floating-Point Values +Mnemonic_XORPS, // Bitwise Logical XOR for Single-Precision Floating-Point Values + +Mnemonic_CVTDQ2PD, // Convert Packed Doubleword Integers to Packed Double-Precision Floating-Point Values +Mnemonic_CVTTPD2DQ, // Convert with Truncation Packed Double-Precision Floating-Point Values to Packed Doubleword Integers + +Mnemonic_CVTDQ2PS, // Convert Packed Doubleword Integers to Packed Single-Precision Floating-Point Values 
+Mnemonic_CVTTPS2DQ, // Convert with Truncation Packed Single-Precision Floating-Point Values to Packed Doubleword Integers +// +// String operations +// +Mnemonic_STD, // Set direction flag +Mnemonic_CLD, // Clear direction flag +Mnemonic_SCAS, // Scan string +Mnemonic_STOS, // Store string + +// +Mnemonic_WAIT, // Check pending pending unmasked floating-point exception +Mnemonic_PADDB, //!< Add packed byte integers +Mnemonic_PADDW, //!< Add packed word integers +Mnemonic_PADDD, //!< Add packed doubleword integers +Mnemonic_PSUBB, //!< Subtract packed byte integers +Mnemonic_PSUBW, //!< Subtract packed word integers +Mnemonic_PSUBD, //!< Subtract packed doubleword integers +Mnemonic_PMULLW, //!< Multiply packed word integers +Mnemonic_PMULLD, //!< Multiply packed doubleword integers +Mnemonic_PSLLW, //!< Shift words left and shift in 0s +Mnemonic_PSLLD, //!< Shift doublewords left and shift in 0s +Mnemonic_PSRAW, //!< Shift words right and shift in sign bits +Mnemonic_PSRAD, //!< Shift doublewords right and shift in sign bits +Mnemonic_PSRLW, //!< Shift words right and shift in 0s +Mnemonic_PSRLD, //!< Shift doublewords right and shift in 0s +Mnemonic_PMOVSXBW, //!< Sign extend 8 packed signed 8-bit integers in the low 8 bytes to 8 packed signed 16-bit integers +Mnemonic_PSHUFB, //!< Shuffle bytes +Mnemonic_PSHUFD, //!< Shuffle doublewords +Mnemonic_PSHUFLW, //!< Shuffle packed low words +Mnemonic_PSHUFHW, //!< Shuffle packed high words +Mnemonic_PHADDSW, //!< Add 16-bit signed integers horizontally, then pack saturated integers +Mnemonic_PHADDW, //!< Add 16-bit signed integers horizontally, then pack +Mnemonic_PHADDD, //!< Add 32-bit signed integers horizontally, then pack +Mnemonic_PHSUBSW, //!< Subtract 16-bit signed integers horizontally, then pack saturated integers +Mnemonic_PHSUBW, //!< Subtract 16-bit signed integers horizontally, then pack +Mnemonic_PHSUBD, //!< Subtract 32-bit signed integers horizontally, then pack +Mnemonic_PEXTRB, //!< Extract a byte 
integer value from xmm +Mnemonic_PEXTRW, //!< Extract a word integer value from xmm +Mnemonic_PEXTRD, //!< Extract a doubleword integer value from xmm +Mnemonic_MOVDQA, //!< Move aligned double quadword +Mnemonic_SHUFPS, //!< Shuffle single words +Mnemonic_MOVAPS, //!< Move aligned single word + +// +Mnemonic_Count +} Mnemonic; + +#undef CCM + +ENCODER_NAMESPACE_END + +#endif // ifndef _ENCODER_DEFS_EXT_H_ diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_prvt.h b/libpixelflinger/codeflinger/x86/libenc/enc_prvt.h new file mode 100644 index 0000000..343b161 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_prvt.h @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ +#ifndef __ENC_PRVT_H_INCLUDED__ +#define __ENC_PRVT_H_INCLUDED__ + +#include "enc_base.h" + +ENCODER_NAMESPACE_START +/* + * @file + * @brief Contains some definitions/constants and other stuff used by the + * Encoder internally. 
+ */ + +enum OpcodeByteKind { + //OpcodeByteKind_Opcode = 0x0000, + OpcodeByteKind_ZeroOpcodeByte = 0x0100, + // + // The names _SlashR, _SlahsNum, _ib, _iw, etc + // represent the appropriate abbreviations used + // in the mnemonic descriptions in the Intel's arch manual. + // + OpcodeByteKind_SlashR = 0x0200, + OpcodeByteKind_SlashNum = 0x0300, + OpcodeByteKind_ib = 0x0400, + OpcodeByteKind_iw = 0x0500, + OpcodeByteKind_id = 0x0600, +#ifdef _EM64T_ + OpcodeByteKind_io = 0x0700, +#endif + OpcodeByteKind_cb = 0x0800, + OpcodeByteKind_cw = 0x0900, + OpcodeByteKind_cd = 0x0A00, + //OpcodeByteKind_cp = 0x0B00, + //OpcodeByteKind_co = 0x0C00, + //OpcodeByteKind_ct = 0x0D00, + + OpcodeByteKind_rb = 0x0E00, + OpcodeByteKind_rw = 0x0F00, + OpcodeByteKind_rd = 0x1000, +#ifdef _EM64T_ + OpcodeByteKind_ro = 0x1100, + //OpcodeByteKind_REX = 0x1200, + OpcodeByteKind_REX_W = 0x1300, +#endif + OpcodeByteKind_plus_i = 0x1400, + /** + * a special marker, means 'no opcode on the given position' + * used in opcodes array, to specify the empty slot, say + * to fill an em64t-specific opcode on ia32. + * last 'e' made lowercase to avoid a mess with 'F' in + * OpcodeByteKind_LAST . + */ + OpcodeByteKind_EMPTY = 0xFFFE, + /** + * a special marker, means 'no more opcodes in the array' + * used in in opcodes array to show that there are no more + * opcodes in the array for a given mnemonic. 
+ */ + OpcodeByteKind_LAST = 0xFFFF, + /** + * a mask to extract the OpcodeByteKind + */ + OpcodeByteKind_KindMask = 0xFF00, + /** + * a mask to extract the opcode byte when presented + */ + OpcodeByteKind_OpcodeMask = 0x00FF +}; + +#ifdef USE_ENCODER_DEFINES + +#define N {0, 0, 0, 0 } +#define U {1, 0, 1, OpndRole_Use } +#define D {1, 1, 0, OpndRole_Def } +#define DU {1, 1, 1, OpndRole_Def|OpndRole_Use } + +#define U_U {2, 0, 2, OpndRole_Use<<2 | OpndRole_Use } +#define D_U {2, 1, 1, OpndRole_Def<<2 | OpndRole_Use } +#define D_DU {2, 2, 1, OpndRole_Def<<2 | (OpndRole_Def|OpndRole_Use) } +#define DU_U {2, 1, 2, ((OpndRole_Def|OpndRole_Use)<<2 | OpndRole_Use) } +#define DU_DU {2, 2, 2, ((OpndRole_Def|OpndRole_Use)<<2 | (OpndRole_Def|OpndRole_Use)) } + +#define DU_DU_DU {3, 3, 3, ((OpndRole_Def|OpndRole_Use)<<4) | ((OpndRole_Def|OpndRole_Use)<<2) | (OpndRole_Def|OpndRole_Use) } +#define DU_DU_U {3, 2, 3, (((OpndRole_Def|OpndRole_Use)<<4) | ((OpndRole_Def|OpndRole_Use)<<2) | OpndRole_Use) } +#define D_DU_U {3, 2, 2, (((OpndRole_Def)<<4) | ((OpndRole_Def|OpndRole_Use)<<2) | OpndRole_Use) } +#define D_U_U {3, 1, 2, (((OpndRole_Def)<<4) | ((OpndRole_Use)<<2) | OpndRole_Use) } + +// Special encoding of 0x00 opcode byte. Note: it's all O-s, not zeros. 
+#define OxOO OpcodeByteKind_ZeroOpcodeByte + +#define Size16 InstPrefix_OpndSize + +#define _r OpcodeByteKind_SlashR + +#define _0 OpcodeByteKind_SlashNum|0 +#define _1 OpcodeByteKind_SlashNum|1 +#define _2 OpcodeByteKind_SlashNum|2 +#define _3 OpcodeByteKind_SlashNum|3 +#define _4 OpcodeByteKind_SlashNum|4 +#define _5 OpcodeByteKind_SlashNum|5 +#define _6 OpcodeByteKind_SlashNum|6 +#define _7 OpcodeByteKind_SlashNum|7 + +// '+i' for floating-point instructions +#define _i OpcodeByteKind_plus_i + + +#define ib OpcodeByteKind_ib +#define iw OpcodeByteKind_iw +#define id OpcodeByteKind_id + +#define cb OpcodeByteKind_cb +#define cw OpcodeByteKind_cw +#define cd OpcodeByteKind_cd + +#define rb OpcodeByteKind_rb +#define rw OpcodeByteKind_rw +#define rd OpcodeByteKind_rd + +#define AL {OpndKind_GPReg, OpndSize_8, OpndExt_Any, RegName_AL} +#define AH {OpndKind_GPReg, OpndSize_8, OpndExt_Any, RegName_AH} +#define AX {OpndKind_GPReg, OpndSize_16, OpndExt_Any, RegName_AX} +#define EAX {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_EAX} +#ifdef _EM64T_ + #define RAX {OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_RAX } +#endif + +#define CL {OpndKind_GPReg, OpndSize_8, OpndExt_Any, RegName_CL} +#define ECX {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_ECX} +#ifdef _EM64T_ + #define RCX {OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_RCX} +#endif + +#define DX {OpndKind_GPReg, OpndSize_16, OpndExt_Any, RegName_DX} +#define EDX {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_EDX} +#ifdef _EM64T_ + #define RDX { OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_RDX } +#endif + +#define ESI {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_ESI} +#ifdef _EM64T_ + #define RSI { OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_RSI } +#endif + +#define EDI {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_EDI} +#ifdef _EM64T_ + #define RDI { OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_RDI } +#endif + +#define r8 {OpndKind_GPReg, OpndSize_8, OpndExt_Any, 
RegName_Null} +#define r16 {OpndKind_GPReg, OpndSize_16, OpndExt_Any, RegName_Null} +#define r32 {OpndKind_GPReg, OpndSize_32, OpndExt_Any, RegName_Null} +#ifdef _EM64T_ + #define r64 { OpndKind_GPReg, OpndSize_64, OpndExt_Any, RegName_Null } +#endif + +#define r_m8 {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_8, OpndExt_Any, RegName_Null} +#define r_m16 {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_16, OpndExt_Any, RegName_Null} +#define r_m32 {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_32, OpndExt_Any, RegName_Null} + +#define r_m8s {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_8, OpndExt_Signed, RegName_Null} +#define r_m16s {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_16, OpndExt_Signed, RegName_Null} +#define r_m32s {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_32, OpndExt_Signed, RegName_Null} + +#define r_m8u {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_8, OpndExt_Zero, RegName_Null} +#define r_m16u {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_16, OpndExt_Zero, RegName_Null} +#define r_m32u {(OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_32, OpndExt_Zero, RegName_Null} + +//'m' was only used in LEA mnemonic, but is replaced with +// set of exact sizes. See more comments for LEA instruction in TheTable. 
+//#define m {OpndKind_Mem, OpndSize_Null, RegName_Null} +#define m8 {OpndKind_Mem, OpndSize_8, OpndExt_Any, RegName_Null} +#define m16 {OpndKind_Mem, OpndSize_16, OpndExt_Any, RegName_Null} +#define m32 {OpndKind_Mem, OpndSize_32, OpndExt_Any, RegName_Null} +#define m64 {OpndKind_Mem, OpndSize_64, OpndExt_Any, RegName_Null} +#ifdef _EM64T_ + #define r_m64 { (OpndKind)(OpndKind_GPReg|OpndKind_Mem), OpndSize_64, OpndExt_Any, RegName_Null } +#endif + +#define imm8 {OpndKind_Imm, OpndSize_8, OpndExt_Any, RegName_Null} +#define imm16 {OpndKind_Imm, OpndSize_16, OpndExt_Any, RegName_Null} +#define imm32 {OpndKind_Imm, OpndSize_32, OpndExt_Any, RegName_Null} + +#define imm8s {OpndKind_Imm, OpndSize_8, OpndExt_Signed, RegName_Null} +#define imm16s {OpndKind_Imm, OpndSize_16, OpndExt_Signed, RegName_Null} +#define imm32s {OpndKind_Imm, OpndSize_32, OpndExt_Signed, RegName_Null} + +#define imm8u {OpndKind_Imm, OpndSize_8, OpndExt_Zero, RegName_Null} +#define imm16u {OpndKind_Imm, OpndSize_16, OpndExt_Zero, RegName_Null} +#define imm32u {OpndKind_Imm, OpndSize_32, OpndExt_Zero, RegName_Null} + +#ifdef _EM64T_ + #define imm64 {OpndKind_Imm, OpndSize_64, OpndExt_Any, RegName_Null } +#endif + +//FIXME: moff-s are in fact memory refs, but presented as immediate. +// Need to specify this in OpndDesc. 
+#define moff8 {OpndKind_Imm, OpndSize_32, OpndExt_Any, RegName_Null} +#define moff16 {OpndKind_Imm, OpndSize_32, OpndExt_Any, RegName_Null} +#define moff32 {OpndKind_Imm, OpndSize_32, OpndExt_Any, RegName_Null} +#ifdef _EM64T_ + #define moff64 {OpndKind_Imm, OpndSize_64, OpndExt_Any, RegName_Null} +#endif + + +#define rel8 {OpndKind_Imm, OpndSize_8, OpndExt_Any, RegName_Null} +#define rel16 {OpndKind_Imm, OpndSize_16, OpndExt_Any, RegName_Null} +#define rel32 {OpndKind_Imm, OpndSize_32, OpndExt_Any, RegName_Null} + +#define mm64 {OpndKind_MMXReg, OpndSize_64, OpndExt_Any, RegName_Null} +#define mm_m64 {(OpndKind)(OpndKind_MMXReg|OpndKind_Mem), OpndSize_64, OpndExt_Any, RegName_Null} + +#define xmm64 {OpndKind_XMMReg, OpndSize_64, OpndExt_Any, RegName_Null} +#define xmm_m64 {(OpndKind)(OpndKind_XMMReg|OpndKind_Mem), OpndSize_64, OpndExt_Any, RegName_Null} + +#define xmm32 {OpndKind_XMMReg, OpndSize_32, OpndExt_Any, RegName_Null} +#define xmm_m32 {(OpndKind)(OpndKind_XMMReg|OpndKind_Mem), OpndSize_32, OpndExt_Any, RegName_Null} + +#define FP0S {OpndKind_FPReg, OpndSize_32, OpndExt_Any, RegName_FP0S} +#define FP0D {OpndKind_FPReg, OpndSize_64, OpndExt_Any, RegName_FP0D} +#define FP1S {OpndKind_FPReg, OpndSize_32, OpndExt_Any, RegName_FP1S} +#define FP1D {OpndKind_FPReg, OpndSize_64, OpndExt_Any, RegName_FP1D} +#define fp32 {OpndKind_FPReg, OpndSize_32, OpndExt_Any, RegName_Null} +#define fp64 {OpndKind_FPReg, OpndSize_64, OpndExt_Any, RegName_Null} + +#ifdef _EM64T_ + #define io OpcodeByteKind_io + #define REX_W OpcodeByteKind_REX_W + +#endif + +#endif // USE_ENCODER_DEFINES + +/** + * @brief Represents the REX part of instruction. + */ +struct Rex { + unsigned char b : 1; + unsigned char x : 1; + unsigned char r : 1; + unsigned char w : 1; + unsigned char dummy : 4; // must be '0100'b + unsigned int :24; +}; + +/** + * @brief Describes SIB (scale,index,base) byte. 
+ */ +struct SIB { + unsigned char base:3; + unsigned char index:3; + unsigned char scale:2; + unsigned int padding:24; +}; +/** + * @brief Describes ModRM byte. + */ +struct ModRM +{ + unsigned char rm:3; + unsigned char reg:3; + unsigned char mod:2; + unsigned int padding:24; +}; + + + +/** +* exactly the same as EncoderBase::OpcodeDesc, but also holds info about +* platform on which the opcode is applicable. +*/ +struct OpcodeInfo { + enum platform { + /// an opcode is valid on all platforms + all, + // opcode is valid on IA-32 only + em64t, + // opcode is valid on Intel64 only + ia32, + // opcode is added for the sake of disassembling, should not be used in encoding + decoder, + // only appears in master table, replaced with 'decoder' in hashed version + decoder32, + // only appears in master table, replaced with 'decoder' in hashed version + decoder64, + }; + platform platf; + unsigned opcode[4+1+1]; + EncoderBase::OpndDesc opnds[EncoderBase::MAX_NUM_OPCODE_OPERANDS]; + EncoderBase::OpndRolesDesc roles; +}; + +/** + * @defgroup MF_ Mnemonic flags +*/ + + /** + * Operation has no special properties. + */ +#define MF_NONE (0x00000000) + /** + * Operation affects flags + */ +#define MF_AFFECTS_FLAGS (0x00000001) + /** + * Operation uses flags - conditional operations, ADC/SBB/ETC + */ +#define MF_USES_FLAGS (0x00000002) + /** + * Operation is conditional - MOVcc/SETcc/Jcc/ETC + */ +#define MF_CONDITIONAL (0x00000004) +/** + * Operation is symmetric - its args can be swapped (ADD/MUL/etc). + */ +#define MF_SYMMETRIC (0x00000008) +/** + * Operation is XOR-like - XOR, SUB - operations of 'arg,arg' is pure def, + * without use. + */ +#define MF_SAME_ARG_NO_USE (0x00000010) + +///@} // ~MNF + +/** + * @see same structure as EncoderBase::MnemonicDesc, but carries + * MnemonicInfo::OpcodeInfo[] instead of OpcodeDesc[]. + * Only used during prebuilding the encoding tables, thus it's hidden under + * the appropriate define. 
+ */ +struct MnemonicInfo { + /** + * The mnemonic itself + */ + Mnemonic mn; + /** + * Various characteristics of mnemonic. + * @see MF_ + */ + unsigned flags; + /** + * Number of args/des/uses/roles for the operation. For the operations + * which may use different number of operands (i.e. IMUL/SHL) use the + * most common value, or leave '0' if you are sure this info is not + * required. + */ + EncoderBase::OpndRolesDesc roles; + /** + * Print name of the mnemonic + */ + const char * name; + /** + * Array of opcodes. + * The terminating opcode description always have OpcodeByteKind_LAST + * at the opcodes[i].opcode[0]. + * The size of '25' has nothing behind it, just counted the max + * number of opcodes currently used (MOV instruction). + */ + OpcodeInfo opcodes[25]; +}; + +ENCODER_NAMESPACE_END + +#endif // ~__ENC_PRVT_H_INCLUDED__ diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_tabl.cpp b/libpixelflinger/codeflinger/x86/libenc/enc_tabl.cpp new file mode 100644 index 0000000..b60d6b7 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_tabl.cpp @@ -0,0 +1,2164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. 
Astapchuk + */ + + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> //qsort +#include <string.h> +#include <memory.h> +#include <errno.h> +#include <stdlib.h> + + +// need to use EM64T-specifics - new registers, defines from enc_prvt, etc... +#if !defined(_EM64T_) + #define UNDEF_EM64T + #define _EM64T_ +#endif + +#define USE_ENCODER_DEFINES +#include "enc_prvt.h" +#include "enc_defs.h" + +#ifdef UNDEF_EM64T + #undef _EM64T_ +#endif + +//Android x86 +#if 0 //!defined(_HAVE_MMX_) + #define Mnemonic_PADDQ Mnemonic_Null + #define Mnemonic_PAND Mnemonic_Null + #define Mnemonic_POR Mnemonic_Null + #define Mnemonic_PSUBQ Mnemonic_Null +#endif + +ENCODER_NAMESPACE_START + + +EncoderBase::MnemonicDesc EncoderBase::mnemonics[Mnemonic_Count]; +EncoderBase::OpcodeDesc EncoderBase::opcodes[Mnemonic_Count][MAX_OPCODES]; +unsigned char EncoderBase::opcodesHashMap[Mnemonic_Count][HASH_MAX]; + + +/** + * @file + * @brief 'Master' copy of encoding data. + */ + +/* +This file contains a 'master copy' of encoding table - this is the info used +by both generator of native instructions (EncoderBase class) and by +disassembling routines. The first one uses an info how to encode the +instruction, and the second does an opposite - several separate tables are +built at runtime from this main table. + +============================================================================= + +The table was designed for easy support and maintenance. Thus, it was made as +much close as possible to the Intel's IA32 Architecture Manual descriptions. +The info is based on the latest (at the moment of writing) revision which is +June 2005, order number 253666-016. + +Normally, almost all of opcodes in the 'master' table represented exactly as +they are shown in the Intel's Architecture manual (well, with slashes +replaced with underscore). There are several exclusions especially marked. 
+ +Normally, to add an opcode/instruction, one only need to copy the whole +string from the manual, and simply replace '/' with '_'. + +I.e., TheManual reads for DEC: + (1) FE /1 DEC r/m8 Valid Valid Decrement r/m8 by 1. + (2) REX + FE /1 DEC r/m8* Valid N.E. Decrement r/m8 by 1. + (3) REX.W + FF /1 DEC r/m64 Valid N.E. Decrement r/m64 by 1. + +1. Note, that there is no need to explicitly specify REX-based opcodes for + instruction to handle additional registers on EM64T: + + (1) FE /1 DEC r/m8 Valid Valid Decrement r/m8 by 1. + (3) REX.W + FF /1 DEC r/m64 Valid N.E. Decrement r/m64 by 1. + +2. Copy the string, strip off the text comments, replace '/'=>'_'. Note, that + the second line is for EM64T only + + (1) FE /1 DEC r/m8 + (3) REX.W + FF /1 DEC r/m64 + +3. Fill out the mnemonic, opcode parameters parts + + BEGIN_MNEMONIC(DEC, MF_AFFECTS_FLAGS, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xFE, _1}, {r_m8}, DU }, + {OpcodeInfo::em64t, {REX_W, 0xFF, _1}, {r_m64}, DU }, + + DU here - one argument, it's used and defined + +4. That's it, that simple ! + +The operand roles (DU here) are used by Jitrino's optimizing engine to +perform data flow analysis. It also used to store/obtain number of operands. + +Special cases are (see the table for details): +LEA +Some FPU operations (i.e. FSTP) +packed things (XORPD, XORPS, CVTDQ2PD, CVTTPD2DQ) + +Also, the Jitrino's needs require to specify all operands - including +implicit ones (see IMUL). + +The master table iself does not need to be ordered - it's get sorted before +processing. It's recommended (though it's not a law) to group similar +instructions together - i.e. FPU instructions, MMX, etc. 

=============================================================================

The encoding engine builds several tables based on the 'master' one (here
'mnemonic' is a kind of synonym for 'instruction'):

- list of mnemonics which holds general info about instructions
  (EncoderBase::mnemonics)
- an array of opcodes descriptions (EncoderBase::opcodes)
- a mapping between a hash value and an opcode description record for a given
  mnemonic (EncoderBase::opcodesHashMap)

The EncoderBase::mnemonics holds general info about instructions.
The EncoderBase::opcodesHashMap is used for fast opcode selection based on
a hash value.
The EncoderBase::opcodes is used for the encoding itself.

=============================================================================
The hash value is calculated and used as follows:

JIT-ted code uses the following operand sizes: 8-, 16-, 32- and 64-bits and
the size of an operand can be encoded in just 2 bits.

The following operand locations are available: one of registers - GP, FP,
MMX, XMM (not taking segment registers), a memory and an immediate, which
gives us 6 variants and can be enumerated in 3 bits.

As a grand total, the whole operand's info needed for opcode selection
can be packed in 5 bits. Taking into account the IMUL mnemonic with its 3
operands (including implicit ones), we're getting 15 bits per instruction and
the complete table is about 32768 items per single instruction.

Seems too many, but luckily, the 15 bit limit will never be reached: the
worst case is IMUL with its 3 operands:
(IMUL r64, r/m64, imm32)/(IMUL r32, r/m32, imm32).
So, assigning the lowest value to the GP register, the max value of the hash
can be reduced.

The hash values to use are:
sizes:
     8 -> 11
    16 -> 10
    32 -> 01
    64 -> 00
locations:
    gp reg    -> 000
    memory    -> 001
    fp reg    -> 010
    mmx reg   -> 011
    xmm reg   -> 100
    immediate -> 101
and the grand total for the worst case would be
[ GP 32] [GP 32] [Imm 32]
[000-01] [000-01] [101 01] = 1077

However, the implicit operands add additional value, and the worst case
is 'SHLD r_m32, r32, CL=r8'. This gives us the maximum number of:

[mem 32] [GP 32] [GP 8b]
[001-01] [000-01] [000-11] = 5155.

The max number is pretty big and the hash values are quite sparse, thus it
is not reasonable to use direct addressing, i.e.
OpcodeDesc[mnemonic][hash_code] - there would be a huge waste of space.

Instead, we use a kind of mapping: the opcodes info is stored in a packed
(here: non-sparse) array. The max number of opcodes will not exceed 255 for
each instruction. And we have an index array in which we store a mapping
between a hash code value and the opcode position for each given instruction.

Sounds a bit sophisticated, but in reality it is simple; the opcode gets
selected in 2 simple steps:

1. Select [hash,mnemonic] => 'n'.

The array is pretty sparse - many cells contain 0xFF which
means 'invalid hash - no opcode with given characteristics'

char EncoderBase::opcodesHashMap[Mnemonic_Count][HASH_MAX] =

+----+----+----+----+----+----+
| 00 | 05 | FF | FF | 03 | 12 | ...
|---------+-------------------+
| 12 | FF | FF | n  | 04 | 25 | ...  <- Mnemonic
|-----------------------------+
| FF | 11 | FF | 10 | 13 | .. | ...
+-----------------------------+
 ...                      ^
                          |
                         hash

2. Select [n,mnemonic] => 'opcode_desc11'

OpcodeDesc EncoderBase::opcodes[Mnemonic_Count][MAX_OPCODES] =

+---------------+---------------+---------------+---------------+
| opcode_desc00 | opcode_desc01 | opcode_desc02 | last_opcode   | ...
+---------------+---------------+---------------+---------------+
| opcode_desc10 | opcode_desc11 | last_opcode   | xxx           |  <- Mnemonic
+---------------+---------------+---------------+---------------+
| opcode_desc20 | opcode_desc21 | opcode_desc22 | opcode_desc23 | ...
+---------------+---------------+---------------+---------------+
 ...
         ^
         |
         n

Now, use 'opcode_desc11'.

=============================================================================
The array of opcodes descriptions (EncoderBase::opcodes) is specially
prepared to maximize performance - the EncoderBase::encode() is quite hot on
client applications for the Jitrino/Jitrino.JET.
The preparation is that opcode descriptions from the 'master' encoding table
are preprocessed and a special set of OpcodeDesc prepared:
First, the 'raw' opcode bytes are extracted. Here, 'raw' means the bytes that
do not depend on any operands values, do not require any analysis and can be
simply copied into the output buffer during encoding. Also, the number of
these 'raw' bytes is counted. The fields are OpcodeDesc::opcode and
OpcodeDesc::opcode_len.

Then the first non-implicit operand is found and its index is stored in
OpcodeDesc::first_opnd.

The bytes that require processing and analysis ('/r', '+i', etc) are
extracted and stored in OpcodeDesc::aux0 and OpcodeDesc::aux1 fields.

Here, a special trick is performed:
    Some opcodes have a register/memory operand, but this is not reflected in
    the opcode column - for example, (MOVQ xmm64, xmm_m64). In this case, a
    fake '_r' is added to the OpcodeDesc::aux field.
    Some other opcodes have immediate operands, but this is again not
    reflected in the opcode column - for example, CALL cd or PUSH imm32.
    In this case, a fake '/cd' or a fake '/id' is added to the appropriate
    OpcodeDesc::aux field.

The OpcodeDesc::last is non-zero for the final OpcodeDesc record (which does
not have valid data itself).
*/

// TODO: To extend flexibility, replace bool fields in MnemonicDesc &
// MnemonicInfo with a set of flags packed into integer field.

/**
 * @brief Computes the opcode-selection hash for an opcode description.
 *
 * Packs the kind and size of up to three operands of @c odesc into a single
 * unsigned short, HASH_BITS_PER_OPERAND bits per operand; operand 0 ends up
 * in the least-significant bits (see the big comment above for the encoding
 * of sizes and locations).
 *
 * NOTE(review): the asserts range-check against the kind_hash/size_hash
 * tables while the value comes from get_kind_hash()/get_size_hash() -
 * presumably those functions index the same tables; confirm in enc_prvt.h.
 */
unsigned short EncoderBase::getHash(const OpcodeInfo* odesc)
{
    /*
    NOTE: any changes in the hash computation must be strictly balanced with
    EncoderBase::Operand::hash_it and EncoderBase::Operands()
    */
    unsigned short hash = 0;
    // The hash computation uses the fast way - table selection instead of if-s.
    if (odesc->roles.count > 0) {
        OpndKind kind = odesc->opnds[0].kind;
        OpndSize size = odesc->opnds[0].size;
        assert(kind<COUNTOF(kind_hash));
        assert(size<COUNTOF(size_hash));
        hash = get_kind_hash(kind) | get_size_hash(size);
    }

    // Each further operand shifts the accumulated hash left by one operand's
    // worth of bits and ORs its own kind/size hash in.
    if (odesc->roles.count > 1) {
        OpndKind kind = odesc->opnds[1].kind;
        OpndSize size = odesc->opnds[1].size;
        assert(kind<COUNTOF(kind_hash));
        assert(size<COUNTOF(size_hash));
        hash = (hash<<HASH_BITS_PER_OPERAND) |
               (get_kind_hash(kind) | get_size_hash(size));
    }

    if (odesc->roles.count > 2) {
        OpndKind kind = odesc->opnds[2].kind;
        OpndSize size = odesc->opnds[2].size;
        assert(kind<COUNTOF(kind_hash));
        assert(size<COUNTOF(size_hash));
        hash = (hash<<HASH_BITS_PER_OPERAND) |
               (get_kind_hash(kind) | get_size_hash(size));
    }
    assert(hash <= HASH_MAX);
    return hash;
}


// Helper macros that lay out masterEncodingTable as one brace initializer:
// a MnemonicInfo record is opened by BEGIN_MNEMONIC() and closed by
// END_MNEMONIC(); its OpcodeInfo list sits between BEGIN_OPCODES() and
// END_OPCODES(), the latter appending the OpcodeByteKind_LAST sentinel.
#define BEGIN_MNEMONIC(mn, flags, roles) \
        { Mnemonic_##mn, flags, roles, #mn,
#define END_MNEMONIC() },
#define BEGIN_OPCODES() {
#define END_OPCODES()   { OpcodeInfo::all, {OpcodeByteKind_LAST}, {}, {0, 0, 0, 0}}}


static MnemonicInfo masterEncodingTable[] = {
//
// Null
//
BEGIN_MNEMONIC(Null, MF_NONE, N)
BEGIN_OPCODES()
END_OPCODES()
END_MNEMONIC()

BEGIN_MNEMONIC(LAHF, MF_USES_FLAGS, D)
BEGIN_OPCODES()
// TheManual says it's not always supported in em64t mode, thus excluding it
    {OpcodeInfo::ia32,    {0x9F},         {EAX}, D },
END_OPCODES()
END_MNEMONIC()
//
// ALU mnemonics - add, adc, or, xor, and, cmp, sub, sbb
// as they differ only in the opcode extension (/digit) number
and +// in which number the opcode start from, the opcode definitions +// for those instructions are packed together +// +// The 'opcode_starts_from' and 'opcode_ext' in DEFINE_ALU_OPCODES() +// are enough to define OpcodeInfo::all opcodes and the 'first_opcode' +// parameter is only due to ADD instruction, which requires an zero opcode +// byte which, in turn, is coded especially in the current coding scheme. +// + +#define DEFINE_ALU_OPCODES( opc_ext, opcode_starts_from, first_opcode, def_use ) \ +\ + {OpcodeInfo::decoder, {opcode_starts_from + 4, ib}, {AL, imm8}, DU_U },\ + {OpcodeInfo::decoder, {Size16, opcode_starts_from + 5, iw}, {AX, imm16}, DU_U },\ + {OpcodeInfo::decoder, {opcode_starts_from + 5, id}, {EAX, imm32}, DU_U },\ + {OpcodeInfo::decoder64, {REX_W, opcode_starts_from+5, id}, {RAX, imm32s},DU_U },\ +\ + {OpcodeInfo::all, {0x80, opc_ext, ib}, {r_m8, imm8}, def_use },\ + {OpcodeInfo::all, {Size16, 0x81, opc_ext, iw}, {r_m16, imm16}, def_use },\ + {OpcodeInfo::all, {0x81, opc_ext, id}, {r_m32, imm32}, def_use },\ + {OpcodeInfo::em64t, {REX_W, 0x81, opc_ext, id}, {r_m64, imm32s}, def_use },\ +\ + {OpcodeInfo::all, {Size16, 0x83, opc_ext, ib}, {r_m16, imm8s}, def_use },\ + {OpcodeInfo::all, {0x83, opc_ext, ib}, {r_m32, imm8s}, def_use },\ + {OpcodeInfo::em64t, {REX_W, 0x83, opc_ext, ib}, {r_m64, imm8s}, def_use },\ +\ + {OpcodeInfo::all, {first_opcode, _r}, {r_m8, r8}, def_use },\ +\ + {OpcodeInfo::all, {Size16, opcode_starts_from+1, _r}, {r_m16, r16}, def_use },\ + {OpcodeInfo::all, {opcode_starts_from+1, _r}, {r_m32, r32}, def_use },\ + {OpcodeInfo::em64t, {REX_W, opcode_starts_from+1, _r}, {r_m64, r64}, def_use },\ +\ + {OpcodeInfo::all, {opcode_starts_from+2, _r}, {r8, r_m8}, def_use },\ +\ + {OpcodeInfo::all, {Size16, opcode_starts_from+3, _r}, {r16, r_m16}, def_use },\ + {OpcodeInfo::all, {opcode_starts_from+3, _r}, {r32, r_m32}, def_use },\ + {OpcodeInfo::em64t, {REX_W, opcode_starts_from+3, _r}, {r64, r_m64}, def_use }, + +BEGIN_MNEMONIC(ADD, 
MF_AFFECTS_FLAGS|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_0, 0x00, OxOO, DU_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(OR, MF_AFFECTS_FLAGS|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_1, 0x08, 0x08, DU_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(ADC, MF_AFFECTS_FLAGS|MF_USES_FLAGS|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_2, 0x10, 0x10, DU_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(SBB, MF_AFFECTS_FLAGS|MF_USES_FLAGS, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_3, 0x18, 0x18, DU_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(AND, MF_AFFECTS_FLAGS|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_4, 0x20, 0x20, DU_U ) +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(SUB, MF_AFFECTS_FLAGS|MF_SAME_ARG_NO_USE, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES(_5, 0x28, 0x28, DU_U ) +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(XOR, MF_AFFECTS_FLAGS|MF_SYMMETRIC|MF_SAME_ARG_NO_USE, DU_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES( _6, 0x30, 0x30, DU_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMP, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + DEFINE_ALU_OPCODES( _7, 0x38, 0x38, U_U ) +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMPXCHG, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xB0, _r}, {r_m8, r8, AL}, DU_DU_DU }, + {OpcodeInfo::all, {Size16, 0x0F, 0xB1, _r}, {r_m16, r16, AX}, DU_DU_DU }, + {OpcodeInfo::all, {0x0F, 0xB1, _r}, {r_m32, r32, EAX}, DU_DU_DU}, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xB1, _r}, {r_m64, r64, RAX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMPXCHG8B, MF_AFFECTS_FLAGS, D) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xC7, _1}, {m64}, DU }, +END_OPCODES() +END_MNEMONIC() + +#undef DEFINE_ALU_OPCODES +// +// +// +BEGIN_MNEMONIC(ADDSD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x58, _r}, {xmm64, xmm_m64}, DU_U}, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(ADDSS, MF_NONE, DU_U) 
+BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x58, _r}, {xmm32, xmm_m32}, DU_U}, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(BSF, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xBC}, {r32, r_m32}, D_U}, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(BSR, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xBD}, {r32, r_m32}, D_U}, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(CALL, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xE8, cd}, {rel32}, U }, + {OpcodeInfo::ia32, {Size16, 0xE8, cw}, {rel16}, U }, + {OpcodeInfo::ia32, {0xFF, _2}, {r_m32}, U }, + {OpcodeInfo::em64t, {0xFF, _2}, {r_m64}, U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMC, MF_USES_FLAGS|MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::decoder, {0xF5}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +//TODO: Workaround. Actually, it's D_DU, but Jitrino's CG thinks it's D_U +BEGIN_MNEMONIC(CDQ, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x99}, {DX, AX}, D_U }, + {OpcodeInfo::all, {0x99}, {EDX, EAX}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x99}, {RDX, RAX}, D_U }, +END_OPCODES() +END_MNEMONIC() + +#define DEFINE_CMOVcc_MNEMONIC( cc ) \ + BEGIN_MNEMONIC(CMOV##cc, MF_USES_FLAGS|MF_CONDITIONAL, DU_U ) \ +BEGIN_OPCODES() \ + {OpcodeInfo::all, {Size16, 0x0F, 0x40 + ConditionMnemonic_##cc, _r}, {r16, r_m16}, DU_U }, \ + {OpcodeInfo::all, {0x0F, 0x40 + ConditionMnemonic_##cc, _r}, {r32, r_m32}, DU_U }, \ + {OpcodeInfo::em64t, {REX_W, 0x0F, 0x40 + ConditionMnemonic_##cc, _r}, {r64, r_m64}, DU_U }, \ +END_OPCODES() \ +END_MNEMONIC() + +DEFINE_CMOVcc_MNEMONIC(O) +DEFINE_CMOVcc_MNEMONIC(NO) +DEFINE_CMOVcc_MNEMONIC(B) +DEFINE_CMOVcc_MNEMONIC(NB) +DEFINE_CMOVcc_MNEMONIC(Z) +DEFINE_CMOVcc_MNEMONIC(NZ) +DEFINE_CMOVcc_MNEMONIC(BE) +DEFINE_CMOVcc_MNEMONIC(NBE) +DEFINE_CMOVcc_MNEMONIC(S) +DEFINE_CMOVcc_MNEMONIC(NS) +DEFINE_CMOVcc_MNEMONIC(P) +DEFINE_CMOVcc_MNEMONIC(NP) +DEFINE_CMOVcc_MNEMONIC(L) +DEFINE_CMOVcc_MNEMONIC(NL) 
+DEFINE_CMOVcc_MNEMONIC(LE) +DEFINE_CMOVcc_MNEMONIC(NLE) + +#undef DEFINE_CMOVcc_MNEMONIC + +/***************************************************************************** + ***** SSE conversion routines ***** +*****************************************************************************/ +// +// double -> float +BEGIN_MNEMONIC(CVTSD2SS, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x5A, _r}, {xmm32, xmm_m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// double -> I_32 +BEGIN_MNEMONIC(CVTSD2SI, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x2D, _r}, {r32, xmm_m64}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0xF2, 0x0F, 0x2D, _r}, {r64, xmm_m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// double [truncated] -> I_32 +BEGIN_MNEMONIC(CVTTSD2SI, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x2C, _r}, {r32, xmm_m64}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0xF2, 0x0F, 0x2C, _r}, {r64, xmm_m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// float -> double +BEGIN_MNEMONIC(CVTSS2SD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x5A, _r}, {xmm64, xmm_m32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// float -> I_32 +BEGIN_MNEMONIC(CVTSS2SI, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x2D, _r}, {r32, xmm_m32}, D_U}, + {OpcodeInfo::em64t, {REX_W, 0xF3, 0x0F, 0x2D, _r}, {r64, xmm_m32}, D_U}, +END_OPCODES() +END_MNEMONIC() + +// float [truncated] -> I_32 +BEGIN_MNEMONIC(CVTTSS2SI, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x2C, _r}, {r32, xmm_m32}, D_U}, + {OpcodeInfo::em64t, {REX_W, 0xF3, 0x0F, 0x2C, _r}, {r64, xmm_m32}, D_U}, +END_OPCODES() +END_MNEMONIC() + +// I_32 -> double +BEGIN_MNEMONIC(CVTSI2SD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x2A, _r}, {xmm64, r_m32}, D_U}, + {OpcodeInfo::em64t, {REX_W, 0xF2, 0x0F, 0x2A, _r}, {xmm64, r_m64}, D_U}, +END_OPCODES() +END_MNEMONIC() + +// I_32 -> float +BEGIN_MNEMONIC(CVTSI2SS, MF_NONE, D_U ) 
+BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x2A, _r}, {xmm32, r_m32}, D_U}, + {OpcodeInfo::em64t, {REX_W, 0xF3, 0x0F, 0x2A, _r}, {xmm32, r_m64}, D_U}, +END_OPCODES() +END_MNEMONIC() + +// +// ~ SSE conversions +// + +BEGIN_MNEMONIC(DEC, MF_AFFECTS_FLAGS, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xFE, _1}, {r_m8}, DU }, + + {OpcodeInfo::all, {Size16, 0xFF, _1}, {r_m16}, DU }, + {OpcodeInfo::all, {0xFF, _1}, {r_m32}, DU }, + {OpcodeInfo::em64t, {REX_W, 0xFF, _1}, {r_m64}, DU }, + + {OpcodeInfo::ia32, {Size16, 0x48|rw}, {r16}, DU }, + {OpcodeInfo::ia32, {0x48|rd}, {r32}, DU }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(DIVSD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x5E, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(DIVSS, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x5E, _r}, {xmm32, xmm_m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +/**************************************************************************** + ***** FPU operations ***** +****************************************************************************/ + +BEGIN_MNEMONIC(FADDP, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDE, 0xC1}, {FP0D}, DU }, + {OpcodeInfo::all, {0xDE, 0xC1}, {FP0S}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLDZ, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xEE}, {FP0D}, D }, + {OpcodeInfo::all, {0xD9, 0xEE}, {FP0S}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FADD, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDC, _0}, {FP0D, m64}, DU_U }, + {OpcodeInfo::all, {0xD8, _0}, {FP0S, m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSUBP, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDE, 0xE9}, {FP0D}, DU }, + {OpcodeInfo::all, {0xDE, 0xE9}, {FP0S}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSUB, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDC, _4}, {FP0D, m64}, DU_U }, + {OpcodeInfo::all, {0xD8, 
_4}, {FP0S, m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FISUB, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDA, _4}, {FP0S, m32}, DU_U }, +// {OpcodeInfo::all, {0xDE, _4}, {FP0S, m16}, DU_U }, +END_OPCODES() +END_MNEMONIC() + + + +BEGIN_MNEMONIC(FMUL, MF_NONE, DU_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD8, _1}, {FP0S, m32}, DU_U }, + {OpcodeInfo::all, {0xDC, _1}, {FP0D, m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FMULP, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDE, 0xC9}, {FP0D}, DU }, + {OpcodeInfo::all, {0xDE, 0xC9}, {FP0S}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FDIVP, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDE, 0xF9}, {FP0D}, DU }, + {OpcodeInfo::all, {0xDE, 0xF9}, {FP0S}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FDIV, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDC, _6}, {FP0D, m64}, DU_U }, + {OpcodeInfo::all, {0xD8, _6}, {FP0S, m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(FUCOM, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDD, 0xE1}, {FP0D, FP1D}, DU_U }, + {OpcodeInfo::all, {0xDD, 0xE1}, {FP0S, FP1S}, DU_U }, + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. + {OpcodeInfo::all, {0xDD, 0xE0|_i}, {fp32}, DU }, + {OpcodeInfo::all, {0xDD, 0xE0|_i}, {fp64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FUCOMI, MF_NONE, D_U ) +BEGIN_OPCODES() + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. 
+ {OpcodeInfo::all, {0xDB, 0xE8|_i}, {fp32}, DU }, + {OpcodeInfo::all, {0xDB, 0xE8|_i}, {fp64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FUCOMP, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDD, 0xE9}, {FP0D, FP1D}, DU_U }, + {OpcodeInfo::all, {0xDD, 0xE9}, {FP0S, FP1S}, DU_U }, + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. + {OpcodeInfo::all, {0xDD, 0xE8|_i}, {fp32}, DU }, + {OpcodeInfo::all, {0xDD, 0xE8|_i}, {fp64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FUCOMIP, MF_NONE, D_U ) +BEGIN_OPCODES() + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. + {OpcodeInfo::all, {0xDF, 0xE8|_i}, {fp32}, DU }, + {OpcodeInfo::all, {0xDF, 0xE8|_i}, {fp64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FUCOMPP, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDA, 0xE9}, {FP0D, FP1D}, DU_U }, + {OpcodeInfo::all, {0xDA, 0xE9}, {FP0S, FP1S}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLDCW, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, _5}, {m16}, U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FNSTCW, MF_NONE, D) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, _7}, {m16}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSTSW, MF_NONE, D) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x9B, 0xDF, 0xE0}, {EAX}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FNSTSW, MF_NONE, D) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDF, 0xE0}, {EAX}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FCHS, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xE0}, {FP0D}, DU }, + {OpcodeInfo::all, {0xD9, 0xE0}, {FP0S}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FCLEX, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x9B, 0xDB, 
0xE2}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FNCLEX, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDB, 0xE2}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +//BEGIN_MNEMONIC(FDECSTP, MF_NONE, N) +// BEGIN_OPCODES() +// {OpcodeInfo::all, {0xD9, 0xF6}, {}, N }, +// END_OPCODES() +//END_MNEMONIC() + +BEGIN_MNEMONIC(FILD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDB, _0}, {FP0S, m32}, D_U }, + {OpcodeInfo::all, {0xDF, _5}, {FP0D, m64}, D_U }, + {OpcodeInfo::all, {0xDB, _0}, {FP0S, m32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +//BEGIN_MNEMONIC(FINCSTP, MF_NONE, N) +// BEGIN_OPCODES() +// {OpcodeInfo::all, {0xD9, 0xF7}, {}, N }, +// END_OPCODES() +//END_MNEMONIC() + +BEGIN_MNEMONIC(FIST, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDB, _2}, {m32, FP0S}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FISTP, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDB, _3}, {m32, FP0S}, D_U }, + {OpcodeInfo::all, {0xDF, _7}, {m64, FP0D}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FISTTP, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xDD, _1}, {m64, FP0D}, D_U }, + {OpcodeInfo::all, {0xDB, _1}, {m32, FP0S}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FRNDINT, MF_NONE, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xFC}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xFC}, {FP0D}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, _0}, {FP0S, m32}, D_U }, + {OpcodeInfo::all, {0xDD, _0}, {FP0D, m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLDLG2, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xEC}, {FP0S}, D }, + {OpcodeInfo::all, {0xD9, 0xEC}, {FP0D}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLDLN2, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xED}, {FP0S}, D }, + {OpcodeInfo::all, {0xD9, 0xED}, {FP0D}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FLD1, MF_NONE, U 
) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xE8}, {FP0S}, D }, + {OpcodeInfo::all, {0xD9, 0xE8}, {FP0D}, D }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(FPREM, MF_NONE, N) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF8}, {}, N }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FPREM1, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF5}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FST, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, _2}, {m32, FP0S}, D_U }, + {OpcodeInfo::all, {0xDD, _2}, {m64, FP0D}, D_U }, + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. + {OpcodeInfo::all, {0xDD, 0xD0|_i}, {fp32}, D }, + {OpcodeInfo::all, {0xDD, 0xD0|_i}, {fp64}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSTP, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, _3}, {m32, FP0S}, D_U }, + {OpcodeInfo::all, {0xDD, _3}, {m64, FP0D}, D_U }, + // A little trick: actually, these 2 opcodes take only index of the + // needed register. To make the things similar to other instructions + // we encode here as if they took FPREG. 
+ {OpcodeInfo::all, {0xDD, 0xD8|_i}, {fp32}, D }, + {OpcodeInfo::all, {0xDD, 0xD8|_i}, {fp64}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSQRT, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xFA}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xFA}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(FYL2X, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF1}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xF1}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(FYL2XP1, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF9}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xF9}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(F2XM1, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF0}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xF0}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FPATAN, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF3}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xF3}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FXCH, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xC9}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xC9}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSCALE, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xFD}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xFD}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FABS, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xE1}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xE1}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FSIN, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xFE}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xFE}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FCOS, MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xFF}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xFF}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(FPTAN, 
MF_NONE, DU) + BEGIN_OPCODES() + {OpcodeInfo::all, {0xD9, 0xF2}, {FP0S}, DU }, + {OpcodeInfo::all, {0xD9, 0xF2}, {FP0D}, DU }, + END_OPCODES() +END_MNEMONIC() + +// +// ~ FPU +// + +BEGIN_MNEMONIC(DIV, MF_AFFECTS_FLAGS, DU_DU_U) +BEGIN_OPCODES() +#if !defined(_EM64T_) + {OpcodeInfo::all, {0xF6, _6}, {AH, AL, r_m8}, DU_DU_U }, + {OpcodeInfo::all, {Size16, 0xF7, _6}, {DX, AX, r_m16}, DU_DU_U }, +#endif + {OpcodeInfo::all, {0xF7, _6}, {EDX, EAX, r_m32}, DU_DU_U }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _6}, {RDX, RAX, r_m64}, DU_DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(IDIV, MF_AFFECTS_FLAGS, DU_DU_U) +BEGIN_OPCODES() +#if !defined(_EM64T_) + {OpcodeInfo::all, {0xF6, _7}, {AH, AL, r_m8}, DU_DU_U }, + {OpcodeInfo::all, {Size16, 0xF7, _7}, {DX, AX, r_m16}, DU_DU_U }, +#endif + {OpcodeInfo::all, {0xF7, _7}, {EDX, EAX, r_m32}, DU_DU_U }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _7}, {RDX, RAX, r_m64}, DU_DU_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(IMUL, MF_AFFECTS_FLAGS, D_DU_U) +BEGIN_OPCODES() + /*{OpcodeInfo::all, {0xF6, _5}, {AH, AL, r_m8}, D_DU_U }, + {OpcodeInfo::all, {Size16, 0xF7, _5}, {DX, AX, r_m16}, D_DU_U }, + */ + // + {OpcodeInfo::all, {0xF7, _5}, {EDX, EAX, r_m32}, D_DU_U }, + //todo: this opcode's hash conflicts with IMUL r64,r_m64 - they're both 0. 
+ // this particular is not currently used, so we may safely drop it, but need to + // revisit the hash implementation + // {OpcodeInfo::em64t, {REX_W, 0xF7, _5}, {RDX, RAX, r_m64}, D_DU_U }, + // + {OpcodeInfo::all, {Size16, 0x0F, 0xAF, _r}, {r16,r_m16}, DU_U }, + {OpcodeInfo::all, {0x0F, 0xAF, _r}, {r32,r_m32}, DU_U }, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xAF, _r}, {r64,r_m64}, DU_U }, + {OpcodeInfo::all, {Size16, 0x6B, _r, ib}, {r16,r_m16,imm8s}, D_DU_U }, + {OpcodeInfo::all, {0x6B, _r, ib}, {r32,r_m32,imm8s}, D_DU_U }, + {OpcodeInfo::em64t, {REX_W, 0x6B, _r, ib}, {r64,r_m64,imm8s}, D_DU_U }, + {OpcodeInfo::all, {Size16, 0x6B, _r, ib}, {r16,imm8s}, DU_U }, + {OpcodeInfo::all, {0x6B, _r, ib}, {r32,imm8s}, DU_U }, + {OpcodeInfo::em64t, {REX_W, 0x6B, _r, ib}, {r64,imm8s}, DU_U }, + {OpcodeInfo::all, {Size16, 0x69, _r, iw}, {r16,r_m16,imm16}, D_U_U }, + {OpcodeInfo::all, {0x69, _r, id}, {r32,r_m32,imm32}, D_U_U }, + {OpcodeInfo::em64t, {REX_W, 0x69, _r, id}, {r64,r_m64,imm32s}, D_U_U }, + {OpcodeInfo::all, {Size16, 0x69, _r, iw}, {r16,imm16}, DU_U }, + {OpcodeInfo::all, {0x69, _r, id}, {r32,imm32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MUL, MF_AFFECTS_FLAGS, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF6, _4}, {AX, AL, r_m8}, D_DU_U }, + {OpcodeInfo::all, {Size16, 0xF7, _4}, {DX, AX, r_m16}, D_DU_U }, + {OpcodeInfo::all, {0xF7, _4}, {EDX, EAX, r_m32}, D_DU_U }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _4}, {RDX, RAX, r_m64}, D_DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(INC, MF_AFFECTS_FLAGS, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xFE, _0}, {r_m8}, DU }, + {OpcodeInfo::all, {Size16, 0xFF, _0}, {r_m16}, DU }, + {OpcodeInfo::all, {0xFF, _0}, {r_m32}, DU }, + {OpcodeInfo::em64t, {REX_W, 0xFF, _0}, {r_m64}, DU }, + {OpcodeInfo::ia32, {Size16, 0x40|rw}, {r16}, DU }, + {OpcodeInfo::ia32, {0x40|rd}, {r32}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(INT3, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xCC}, {}, N }, 
+END_OPCODES() +END_MNEMONIC() + +#define DEFINE_Jcc_MNEMONIC( cc ) \ + BEGIN_MNEMONIC(J##cc, MF_USES_FLAGS|MF_CONDITIONAL, U ) \ +BEGIN_OPCODES() \ + {OpcodeInfo::all, {0x70 + ConditionMnemonic_##cc, cb }, { rel8 }, U }, \ + {OpcodeInfo::ia32, {Size16, 0x0F, 0x80 + ConditionMnemonic_##cc, cw}, { rel16 }, U }, \ + {OpcodeInfo::all, {0x0F, 0x80 + ConditionMnemonic_##cc, cd}, { rel32 }, U }, \ +END_OPCODES() \ +END_MNEMONIC() + + +DEFINE_Jcc_MNEMONIC(O) +DEFINE_Jcc_MNEMONIC(NO) +DEFINE_Jcc_MNEMONIC(B) +DEFINE_Jcc_MNEMONIC(NB) +DEFINE_Jcc_MNEMONIC(Z) +DEFINE_Jcc_MNEMONIC(NZ) +DEFINE_Jcc_MNEMONIC(BE) +DEFINE_Jcc_MNEMONIC(NBE) + +DEFINE_Jcc_MNEMONIC(S) +DEFINE_Jcc_MNEMONIC(NS) +DEFINE_Jcc_MNEMONIC(P) +DEFINE_Jcc_MNEMONIC(NP) +DEFINE_Jcc_MNEMONIC(L) +DEFINE_Jcc_MNEMONIC(NL) +DEFINE_Jcc_MNEMONIC(LE) +DEFINE_Jcc_MNEMONIC(NLE) + +#undef DEFINE_Jcc_MNEMONIC + +BEGIN_MNEMONIC(JMP, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xEB, cb}, {rel8}, U }, + {OpcodeInfo::ia32, {Size16, 0xE9, cw}, {rel16}, U }, + {OpcodeInfo::all, {0xE9, cd}, {rel32}, U }, + {OpcodeInfo::ia32, {Size16, 0xFF, _4}, {r_m16}, U }, + {OpcodeInfo::ia32, {0xFF, _4}, {r_m32}, U }, + {OpcodeInfo::em64t, {0xFF, _4}, {r_m64}, U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(LEA, MF_NONE, D_U ) +BEGIN_OPCODES() + /* + A special case: the LEA instruction itself does not care about size of + second operand. This is obviuos why it is, and thus in The Manual, a + simple 'm' without size is used. + However, in the Jitrino's instrucitons we'll have an operand with a size. + Also, the hashing scheme is not supposed to handle OpndSize_Null, and + making it to do so will lead to unnecessary complication of hashing + scheme. Thus, instead of handling it as a special case, we simply make + copies of the opcodes with sizes set. 
+ {OpcodeInfo::all, {0x8D, _r}, {r32, m}, D_U }, + {OpcodeInfo::em64t, {0x8D, _r}, {r64, m}, D_U }, + */ + //Android x86: keep r32, m32 only, otherwise, will have decoding error + //{OpcodeInfo::all, {0x8D, _r}, {r32, m8}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x8D, _r}, {r64, m8}, D_U }, + //{OpcodeInfo::all, {0x8D, _r}, {r32, m16}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x8D, _r}, {r64, m16}, D_U }, + {OpcodeInfo::all, {0x8D, _r}, {r32, m32}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x8D, _r}, {r64, m32}, D_U }, + {OpcodeInfo::all, {0x8D, _r}, {r32, m64}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x8D, _r}, {r64, m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(LOOP, MF_AFFECTS_FLAGS|MF_USES_FLAGS, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xE2, cb}, {ECX, rel8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(LOOPE, MF_AFFECTS_FLAGS|MF_USES_FLAGS, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xE1, cb}, {ECX, rel8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(LOOPNE, MF_AFFECTS_FLAGS|MF_USES_FLAGS, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xE0, cb}, {ECX, rel8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOV, MF_NONE, D_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x88, _r}, {r_m8,r8}, D_U }, + + {OpcodeInfo::all, {Size16, 0x89, _r}, {r_m16,r16}, D_U }, + {OpcodeInfo::all, {0x89, _r}, {r_m32,r32}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x89, _r}, {r_m64,r64}, D_U }, + {OpcodeInfo::all, {0x8A, _r}, {r8,r_m8}, D_U }, + + {OpcodeInfo::all, {Size16, 0x8B, _r}, {r16,r_m16}, D_U }, + {OpcodeInfo::all, {0x8B, _r}, {r32,r_m32}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x8B, _r}, {r64,r_m64}, D_U }, + + {OpcodeInfo::all, {0xB0|rb}, {r8,imm8}, D_U }, + + {OpcodeInfo::all, {Size16, 0xB8|rw}, {r16,imm16}, D_U }, + {OpcodeInfo::all, {0xB8|rd}, {r32,imm32}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0xB8|rd}, {r64,imm64}, D_U }, + {OpcodeInfo::all, {0xC6, _0}, {r_m8,imm8}, D_U }, + + {OpcodeInfo::all, {Size16, 0xC7, _0}, {r_m16,imm16}, D_U }, + 
{OpcodeInfo::all, {0xC7, _0}, {r_m32,imm32}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0xC7, _0}, {r_m64,imm32s}, D_U }, + + {OpcodeInfo::decoder, {0xA0}, {AL, moff8}, D_U }, + {OpcodeInfo::decoder, {Size16, 0xA1}, {AX, moff16}, D_U }, + {OpcodeInfo::decoder, {0xA1}, {EAX, moff32}, D_U }, + //{OpcodeInfo::decoder64, {REX_W, 0xA1}, {RAX, moff64}, D_U }, + + {OpcodeInfo::decoder, {0xA2}, {moff8, AL}, D_U }, + {OpcodeInfo::decoder, {Size16, 0xA3}, {moff16, AX}, D_U }, + {OpcodeInfo::decoder, {0xA3}, {moff32, EAX}, D_U }, + //{OpcodeInfo::decoder64, {REX_W, 0xA3}, {moff64, RAX}, D_U }, +END_OPCODES() +END_MNEMONIC() + + + +BEGIN_MNEMONIC(XCHG, MF_NONE, DU_DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x87, _r}, {r_m32,r32}, DU_DU }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(MOVQ, MF_NONE, D_U ) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0x6F, _r}, {mm64, mm_m64}, D_U }, + {OpcodeInfo::all, {0x0F, 0x7F, _r}, {mm_m64, mm64}, D_U }, +#endif + {OpcodeInfo::all, {0xF3, 0x0F, 0x7E }, {xmm64, xmm_m64}, D_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0xD6 }, {xmm_m64, xmm64}, D_U }, +// {OpcodeInfo::em64t, {REX_W, 0x66, 0x0F, 0x6E, _r}, {xmm64, r_m64}, D_U }, +// {OpcodeInfo::em64t, {REX_W, 0x66, 0x0F, 0x7E, _r}, {r_m64, xmm64}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x66, 0x0F, 0x6E, _r}, {xmm64, r64}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x66, 0x0F, 0x7E, _r}, {r64, xmm64}, D_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(MOVD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x6E, _r}, {xmm32, r_m32}, D_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x7E, _r}, {r_m32, xmm32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// +// A bunch of MMX instructions +// +#ifdef _HAVE_MMX_ + +BEGIN_MNEMONIC(EMMS, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0x77}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +#endif + +BEGIN_MNEMONIC(PADDQ, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xD4, _r}, {mm64, mm_m64}, DU_U }, 
+#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xD4, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PAND, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xDB, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xDB, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(POR, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xEB, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xEB, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSUBQ, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xFB, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xFB, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PANDN, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xDF, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xDF, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() +BEGIN_MNEMONIC(PSLLQ, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xF3, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xF3, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x73, _6, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() +BEGIN_MNEMONIC(PSRLQ, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xD3, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xD3, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x73, _2, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PXOR, MF_NONE, DU_U) +BEGIN_OPCODES() +#ifdef _HAVE_MMX_ + {OpcodeInfo::all, {0x0F, 0xEF, _r}, {mm64, mm_m64}, DU_U }, +#endif + {OpcodeInfo::all, {0x66, 0x0F, 0xEF, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(MOVAPD, 
MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x28, _r}, {xmm64, xmm_m64}, D_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x29, _r}, {xmm_m64, xmm64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVAPS, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0x28, _r}, {xmm64, xmm_m64}, D_U }, + {OpcodeInfo::all, {0x0F, 0x29, _r}, {xmm_m64, xmm64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(SHUFPS, MF_NONE, D_U_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xC6, _r, ib}, {xmm64, xmm_m64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(MOVSD, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x10, _r}, {xmm64, xmm_m64}, D_U }, + {OpcodeInfo::all, {0xF2, 0x0F, 0x11, _r}, {xmm_m64, xmm64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVSS, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x10, _r}, {xmm32, xmm_m32}, D_U }, + {OpcodeInfo::all, {0xF3, 0x0F, 0x11, _r}, {xmm_m32, xmm32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVSX, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {Size16, 0x0F, 0xBE, _r}, {r16, r_m8s}, D_U }, + {OpcodeInfo::all, {0x0F, 0xBE, _r}, {r32, r_m8s}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xBE, _r}, {r64, r_m8s}, D_U }, + + {OpcodeInfo::all, {0x0F, 0xBF, _r}, {r32, r_m16s}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xBF, _r}, {r64, r_m16s}, D_U }, + + {OpcodeInfo::em64t, {REX_W, 0x63, _r}, {r64, r_m32s}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVZX, MF_NONE, D_U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {Size16, 0x0F, 0xB6, _r}, {r16, r_m8u}, D_U }, + {OpcodeInfo::all, {0x0F, 0xB6, _r}, {r32, r_m8u}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xB6, _r}, {r64, r_m8u}, D_U }, + + {OpcodeInfo::all, {0x0F, 0xB7, _r}, {r32, r_m16u}, D_U }, + {OpcodeInfo::em64t, {REX_W, 0x0F, 0xB7, _r}, {r64, r_m16u}, D_U }, + //workaround to get r/rm32->r64 ZX mov functionality: + //simple 32bit reg copying zeros high bits in 
64bit reg + {OpcodeInfo::em64t, {0x8B, _r}, {r64, r_m32u}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MULSD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x59, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MULSS, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x59, _r}, {xmm32, xmm_m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(NEG, MF_AFFECTS_FLAGS, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF6, _3}, {r_m8}, DU }, + + {OpcodeInfo::all, {Size16, 0xF7, _3}, {r_m16}, DU }, + {OpcodeInfo::all, {0xF7, _3}, {r_m32}, DU }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _3}, {r_m64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(NOP, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x90}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(NOT, MF_AFFECTS_FLAGS, DU ) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF6, _2}, {r_m8}, DU }, + {OpcodeInfo::all, {Size16, 0xF7, _2}, {r_m16}, DU }, + {OpcodeInfo::all, {0xF7, _2}, {r_m32}, DU }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _2}, {r_m64}, DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(POP, MF_NONE, D) +BEGIN_OPCODES() + {OpcodeInfo::all, {Size16, 0x8F, _0}, {r_m16}, D }, + {OpcodeInfo::ia32, {0x8F, _0}, {r_m32}, D }, + {OpcodeInfo::em64t, {0x8F, _0}, {r_m64}, D }, + + {OpcodeInfo::all, {Size16, 0x58|rw }, {r16}, D }, + {OpcodeInfo::ia32, {0x58|rd }, {r32}, D }, + {OpcodeInfo::em64t, {0x58|rd }, {r64}, D }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(POPFD, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x9D}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PREFETCH, MF_NONE, U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0x18, _0}, {m8}, U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PUSH, MF_NONE, U ) +BEGIN_OPCODES() + {OpcodeInfo::all, {Size16, 0xFF, _6}, {r_m16}, U }, + {OpcodeInfo::ia32, {0xFF, _6}, {r_m32}, U }, + {OpcodeInfo::em64t, {0xFF, _6}, {r_m64}, U }, + + 
{OpcodeInfo::all, {Size16, 0x50|rw }, {r16}, U }, + {OpcodeInfo::ia32, {0x50|rd }, {r32}, U }, + {OpcodeInfo::em64t, {0x50|rd }, {r64}, U }, + + {OpcodeInfo::all, {0x6A}, {imm8}, U }, + {OpcodeInfo::all, {Size16, 0x68}, {imm16}, U }, + {OpcodeInfo::ia32, {0x68}, {imm32}, U }, +// {OpcodeInfo::em64t, {0x68}, {imm64}, U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PUSHFD, MF_USES_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x9C}, {}, N }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(RET, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xC3}, {}, N }, + {OpcodeInfo::all, {0xC2, iw}, {imm16}, U }, +END_OPCODES() +END_MNEMONIC() + +#define DEFINE_SETcc_MNEMONIC( cc ) \ + BEGIN_MNEMONIC(SET##cc, MF_USES_FLAGS|MF_CONDITIONAL, DU) \ +BEGIN_OPCODES() \ + {OpcodeInfo::all, {0x0F, 0x90 + ConditionMnemonic_##cc}, {r_m8}, DU }, \ +END_OPCODES() \ +END_MNEMONIC() + +DEFINE_SETcc_MNEMONIC(O) +DEFINE_SETcc_MNEMONIC(NO) +DEFINE_SETcc_MNEMONIC(B) +DEFINE_SETcc_MNEMONIC(NB) +DEFINE_SETcc_MNEMONIC(Z) +DEFINE_SETcc_MNEMONIC(NZ) +DEFINE_SETcc_MNEMONIC(BE) +DEFINE_SETcc_MNEMONIC(NBE) + +DEFINE_SETcc_MNEMONIC(S) +DEFINE_SETcc_MNEMONIC(NS) +DEFINE_SETcc_MNEMONIC(P) +DEFINE_SETcc_MNEMONIC(NP) +DEFINE_SETcc_MNEMONIC(L) +DEFINE_SETcc_MNEMONIC(NL) +DEFINE_SETcc_MNEMONIC(LE) +DEFINE_SETcc_MNEMONIC(NLE) + +#undef DEFINE_SETcc_MNEMONIC + +#define DEFINE_SHIFT_MNEMONIC(nam, slash_num, flags) \ +BEGIN_MNEMONIC(nam, flags, DU_U) \ +BEGIN_OPCODES()\ + /* D0 & D1 opcodes are added w/o 2nd operand (1) because */\ + /* they are used for decoding only so only instruction length is needed */\ + {OpcodeInfo::decoder, {0xD0, slash_num}, {r_m8/*,const_1*/}, DU },\ + {OpcodeInfo::all, {0xD2, slash_num}, {r_m8, CL}, DU_U },\ + {OpcodeInfo::all, {0xC0, slash_num, ib}, {r_m8, imm8}, DU_U },\ +\ + {OpcodeInfo::decoder, {Size16, 0xD1, slash_num}, {r_m16/*,const_1*/}, DU },\ + {OpcodeInfo::all, {Size16, 0xD3, slash_num}, {r_m16, CL}, DU_U },\ + {OpcodeInfo::all, {Size16, 0xC1, slash_num, ib}, 
{r_m16, imm8 }, DU_U },\ +\ + {OpcodeInfo::decoder, {0xD1, slash_num}, {r_m32/*,const_1*/}, DU },\ + {OpcodeInfo::decoder64, {REX_W, 0xD1, slash_num}, {r_m64/*,const_1*/}, DU },\ +\ + {OpcodeInfo::all, {0xD3, slash_num}, {r_m32, CL}, DU_U },\ + {OpcodeInfo::em64t, {REX_W, 0xD3, slash_num}, {r_m64, CL}, DU_U },\ +\ + {OpcodeInfo::all, {0xC1, slash_num, ib}, {r_m32, imm8}, DU_U },\ + {OpcodeInfo::em64t, {REX_W, 0xC1, slash_num, ib}, {r_m64, imm8}, DU_U },\ +END_OPCODES()\ +END_MNEMONIC() + + +DEFINE_SHIFT_MNEMONIC(ROL, _0, MF_AFFECTS_FLAGS) +DEFINE_SHIFT_MNEMONIC(ROR, _1, MF_AFFECTS_FLAGS) +DEFINE_SHIFT_MNEMONIC(RCL, _2, MF_AFFECTS_FLAGS|MF_USES_FLAGS) +DEFINE_SHIFT_MNEMONIC(RCR, _3, MF_AFFECTS_FLAGS|MF_USES_FLAGS) + +DEFINE_SHIFT_MNEMONIC(SAL, _4, MF_AFFECTS_FLAGS) +DEFINE_SHIFT_MNEMONIC(SHR, _5, MF_AFFECTS_FLAGS) +DEFINE_SHIFT_MNEMONIC(SAR, _7, MF_AFFECTS_FLAGS) + +#undef DEFINE_SHIFT_MNEMONIC + +BEGIN_MNEMONIC(SHLD, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xA5}, {r_m32, r32, CL}, DU_DU_U }, + {OpcodeInfo::all, {0x0F, 0xA4}, {r_m32, r32, imm8}, DU_DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(SHRD, MF_AFFECTS_FLAGS, N) +// TODO: the def/use info is wrong +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0xAD}, {r_m32, r32, CL}, DU_DU_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(SUBSD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x5C, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(SUBSS, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x5C, _r}, {xmm32, xmm_m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(TEST, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + + {OpcodeInfo::decoder, {0xA8, ib}, { AL, imm8}, U_U }, + {OpcodeInfo::decoder, {0xA9, iw}, { AX, imm16}, U_U }, + {OpcodeInfo::decoder, {0xA9, id}, { EAX, imm32}, U_U }, + {OpcodeInfo::decoder64, {REX_W, 0xA9, id}, { RAX, imm32s}, U_U }, + + {OpcodeInfo::all, {0xF6, _0, ib}, {r_m8,imm8}, U_U 
}, + + {OpcodeInfo::all, {Size16, 0xF7, _0, iw}, {r_m16,imm16}, U_U }, + {OpcodeInfo::all, {0xF7, _0, id}, {r_m32,imm32}, U_U }, + {OpcodeInfo::em64t, {REX_W, 0xF7, _0, id}, {r_m64,imm32s}, U_U }, + + {OpcodeInfo::all, {0x84, _r}, {r_m8,r8}, U_U }, + + {OpcodeInfo::all, {Size16, 0x85, _r}, {r_m16,r16}, U_U }, + {OpcodeInfo::all, {0x85, _r}, {r_m32,r32}, U_U }, + {OpcodeInfo::em64t, {REX_W, 0x85, _r}, {r_m64,r64}, U_U }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(UCOMISD, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x2E, _r}, {xmm64, xmm_m64}, U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(UCOMISS, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0x2E, _r}, {xmm32, xmm_m32}, U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(COMISD, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x2F, _r}, {xmm64, xmm_m64}, U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(COMISS, MF_AFFECTS_FLAGS, U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x0F, 0x2F, _r}, {xmm32, xmm_m32}, U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(XORPD, MF_SAME_ARG_NO_USE|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + //Note: they're actually 128 bits + {OpcodeInfo::all, {0x66, 0x0F, 0x57, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(XORPS, MF_SAME_ARG_NO_USE|MF_SYMMETRIC, DU_U) +BEGIN_OPCODES() + //Note: they're actually 128 bits + {OpcodeInfo::all, {0x0F, 0x57, _r}, {xmm32, xmm_m32}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CVTDQ2PD, MF_NONE, D_U ) +BEGIN_OPCODES() + //Note: they're actually 128 bits + {OpcodeInfo::all, {0xF3, 0x0F, 0xE6}, {xmm64, xmm_m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CVTDQ2PS, MF_NONE, D_U ) +BEGIN_OPCODES() + //Note: they're actually 128 bits + {OpcodeInfo::all, {0x0F, 0x5B, _r}, {xmm32, xmm_m32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CVTTPD2DQ, MF_NONE, D_U ) +BEGIN_OPCODES() + //Note: 
they're actually 128 bits + {OpcodeInfo::all, {0x66, 0x0F, 0xE6}, {xmm64, xmm_m64}, D_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CVTTPS2DQ, MF_NONE, D_U ) +BEGIN_OPCODES() + //Note: they're actually 128 bits + {OpcodeInfo::all, {0xF3, 0x0F, 0x5B, _r}, {xmm32, xmm_m32}, D_U }, +END_OPCODES() +END_MNEMONIC() + +// +// String operations +// +BEGIN_MNEMONIC(STD, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xFD}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CLD, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xFC}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(SCAS, MF_AFFECTS_FLAGS, N) +// to be symmetric, this mnemonic must have either m32 or RegName_EAX +// but as long, as Jitrino's CG does not use the mnemonic, leaving it +// in its natural form +BEGIN_OPCODES() + {OpcodeInfo::all, {0xAF}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(STOS, MF_AFFECTS_FLAGS, DU_DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xAB}, {EDI, ECX, EAX}, DU_DU_U }, + {OpcodeInfo::all, {0xAA}, {EDI, ECX, AL}, DU_DU_U }, + {OpcodeInfo::em64t, {REX_W, 0xAB}, {RDI, RCX, RAX}, DU_DU_U }, +END_OPCODES() +END_MNEMONIC() + +/* +MOVS and CMPS are the special cases. +Most the code in both CG and Encoder do not expect 2 memory operands. +Also, they are not supposed to setup constrains on which register the +memory reference must reside - m8,m8 or m32,m32 is not the choice. +We can't use r8,r8 either - will have problem with 8bit EDI, ESI. +So, as the workaround we do r32,r32 and specify size of the operand through +the specific mnemonic - the same is in the codegen. 
+*/ +BEGIN_MNEMONIC(MOVS8, MF_NONE, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {0xA4}, {r32,r32,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {0xA4}, {r64,r64,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVS16, MF_NONE, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {Size16, 0xA5}, {r32,r32,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {Size16, 0xA5}, {r64,r64,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVS32, MF_NONE, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {0xA5}, {r32,r32,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {0xA5}, {r64,r64,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVS64, MF_NONE, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::em64t, {REX_W,0xA5}, {r64,r64,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMPSB, MF_AFFECTS_FLAGS, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {0xA6}, {ESI,EDI,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {0xA6}, {RSI,RDI,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMPSW, MF_AFFECTS_FLAGS, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {Size16, 0xA7}, {ESI,EDI,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {Size16, 0xA7}, {RSI,RDI,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(CMPSD, MF_AFFECTS_FLAGS, DU_DU_DU) +BEGIN_OPCODES() + {OpcodeInfo::ia32, {0xA7}, {ESI,EDI,ECX}, DU_DU_DU }, + {OpcodeInfo::em64t, {0xA7}, {RSI,RDI,RCX}, DU_DU_DU }, +END_OPCODES() +END_MNEMONIC() + + +BEGIN_MNEMONIC(WAIT, MF_AFFECTS_FLAGS, N) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x9B}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +// +// ~String operations +// + +// +//Note: the instructions below added for the sake of disassembling routine. +// They need to have flags, params and params usage to be defined more precisely. 
+// +BEGIN_MNEMONIC(LEAVE, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::decoder, {0xC9}, {}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(ENTER, MF_NONE, N) +BEGIN_OPCODES() + {OpcodeInfo::decoder, {0xC8, iw, ib}, {imm16, imm8}, N }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PADDB, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xFC, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PADDW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xFD, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PADDD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xFE, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSUBB, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xF8, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSUBW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xF9, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSUBD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xFA, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PMULLW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xD5, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PMULLD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x40, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSLLW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xF1, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x71, _6, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSLLD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xF2, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x72, _6, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() 
+END_MNEMONIC() + +BEGIN_MNEMONIC(PSRAW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xE1, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x71, _4, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSRAD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xE2, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x72, _4, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSRLW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xD1, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x71, _2, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSRLD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xD2, _r}, {xmm64, xmm_m64}, DU_U }, + {OpcodeInfo::all, {0x66, 0x0F, 0x72, _2, ib}, {xmm64, imm8}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PMOVSXBW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x20, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSHUFB, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x00, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSHUFD, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x70, _r, ib}, {xmm64, xmm_m64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSHUFLW, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF2, 0x0F, 0x70, _r, ib}, {xmm64, xmm_m64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PSHUFHW, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0xF3, 0x0F, 0x70, _r, ib}, {xmm64, xmm_m64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHADDSW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x03, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHADDW, MF_NONE, DU_U) 
+BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x01, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHADDD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x02, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHSUBSW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x07, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHSUBW, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x05, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PHSUBD, MF_NONE, DU_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x38, 0x06, _r}, {xmm64, xmm_m64}, DU_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PEXTRB, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x3A, 0x14, _r, ib}, {r32, xmm64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PEXTRW, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0xC5, _r, ib}, {r32, xmm64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(PEXTRD, MF_NONE, D_U_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x3A, 0x16, _r, ib}, {r_m32, xmm64, imm8}, D_U_U }, +END_OPCODES() +END_MNEMONIC() + +BEGIN_MNEMONIC(MOVDQA, MF_NONE|MF_SYMMETRIC, D_U) +BEGIN_OPCODES() + {OpcodeInfo::all, {0x66, 0x0F, 0x6F, _r}, {xmm64, xmm_m64}, D_U }, + //The encoder cannot properly look up when operands are symmetric but opcode is not: + //{OpcodeInfo::all, {0x66, 0x0F, 0x7F, _r}, {xmm_m128, xmm128}, D_U }, +END_OPCODES() +END_MNEMONIC() + +}; // ~masterEncodingTable[] + +ENCODER_NAMESPACE_END + +ENCODER_NAMESPACE_START + +static int compareMnemonicInfo(const void* info1, const void* info2) +{ + Mnemonic id1, id2; + + id1 = ((const MnemonicInfo*) info1)->mn; + id2 = ((const MnemonicInfo*) info2)->mn; + if (id1 < id2) + return -1; + if (id1 > id2) + return 1; + return 0; +} + +int 
EncoderBase::buildTable(void) +{ + // A check: all mnemonics must be covered + assert(COUNTOF(masterEncodingTable) == Mnemonic_Count); + + // sort out the mnemonics so the list become ordered + qsort(masterEncodingTable, Mnemonic_Count, sizeof(MnemonicInfo), compareMnemonicInfo); + + // + // clear the things + // + memset(opcodesHashMap, NOHASH, sizeof(opcodesHashMap)); + memset(opcodes, 0, sizeof(opcodes)); + // + // and, finally, build it + for (unsigned i=0; i<Mnemonic_Count; i++) { + assert((Mnemonic)i == (masterEncodingTable + i)->mn); + buildMnemonicDesc(masterEncodingTable+i); + } + return 0; +} + +void EncoderBase::buildMnemonicDesc(const MnemonicInfo * minfo) +{ + MnemonicDesc& mdesc = mnemonics[minfo->mn]; + mdesc.mn = minfo->mn; + mdesc.flags = minfo->flags; + mdesc.roles = minfo->roles; + mdesc.name = minfo->name; + + // + // fill the used opcodes + // + for (unsigned i=0, oindex=0; i<COUNTOF(minfo->opcodes); i++) { + + const OpcodeInfo& oinfo = minfo->opcodes[i]; + OpcodeDesc& odesc = opcodes[minfo->mn][oindex]; + // last opcode ? 
+ if (oinfo.opcode[0] == OpcodeByteKind_LAST) { + // mark the opcode 'last', exit + odesc.opcode_len = 0; + odesc.last = 1; + break; + } + odesc.last = 0; +#ifdef _EM64T_ + if (oinfo.platf == OpcodeInfo::ia32) { continue; } + if (oinfo.platf == OpcodeInfo::decoder32) { continue; } +#else + if (oinfo.platf == OpcodeInfo::em64t) { continue; } + if (oinfo.platf == OpcodeInfo::decoder64) { continue; } +#endif + if (oinfo.platf == OpcodeInfo::decoder64 || + oinfo.platf == OpcodeInfo::decoder32) { + odesc.platf = OpcodeInfo::decoder; + } + else { + odesc.platf = (char)oinfo.platf; + } + // + // fill out opcodes + // + unsigned j = 0; + odesc.opcode_len = 0; + for(; oinfo.opcode[j]; j++) { + unsigned opcod = oinfo.opcode[j]; + unsigned kind = opcod&OpcodeByteKind_KindMask; + if (kind == OpcodeByteKind_REX_W) { + odesc.opcode[odesc.opcode_len++] = (unsigned char)0x48; + continue; + } + else if(kind != 0 && kind != OpcodeByteKind_ZeroOpcodeByte) { + break; + } + unsigned lowByte = (opcod & OpcodeByteKind_OpcodeMask); + odesc.opcode[odesc.opcode_len++] = (unsigned char)lowByte; + } + assert(odesc.opcode_len<5); + odesc.aux0 = odesc.aux1 = 0; + if (oinfo.opcode[j] != 0) { + odesc.aux0 = oinfo.opcode[j]; + assert((odesc.aux0 & OpcodeByteKind_KindMask) != 0); + ++j; + if(oinfo.opcode[j] != 0) { + odesc.aux1 = oinfo.opcode[j]; + assert((odesc.aux1 & OpcodeByteKind_KindMask) != 0); + } + } + else if (oinfo.roles.count>=2) { + if (((oinfo.opnds[0].kind&OpndKind_Mem) && + (isRegKind(oinfo.opnds[1].kind))) || + ((oinfo.opnds[1].kind&OpndKind_Mem) && + (isRegKind(oinfo.opnds[0].kind)))) { + // Example: MOVQ xmm1, xmm/m64 has only opcodes + // same with SHRD + // Adding fake /r + odesc.aux0 = _r; + } + } + else if (oinfo.roles.count==1) { + if (oinfo.opnds[0].kind&OpndKind_Mem) { + // Example: SETcc r/m8, adding fake /0 + odesc.aux0 = _0; + } + } + // check imm + if (oinfo.roles.count > 0 && + (oinfo.opnds[0].kind == OpndKind_Imm || + oinfo.opnds[oinfo.roles.count-1].kind == 
OpndKind_Imm)) { + // Example: CALL cd, PUSH imm32 - they fit both opnds[0] and + // opnds[oinfo.roles.count-1]. + // The A3 opcode fits only opnds[0] - it's currently have + // MOV imm32, EAX. Looks ridiculous, but this is how the + // moffset is currently implemented. Will need to fix together + // with other usages of moff. + // adding fake /cd or fake /id + unsigned imm_opnd_index = + oinfo.opnds[0].kind == OpndKind_Imm ? 0 : oinfo.roles.count-1; + OpndSize sz = oinfo.opnds[imm_opnd_index].size; + unsigned imm_encode, coff_encode; + if (sz==OpndSize_8) {imm_encode = ib; coff_encode=cb; } + else if (sz==OpndSize_16) {imm_encode = iw; coff_encode=cw;} + else if (sz==OpndSize_32) {imm_encode = id; coff_encode=cd; } + else if (sz==OpndSize_64) {imm_encode = io; coff_encode=0xCC; } + else { assert(false); imm_encode=0xCC; coff_encode=0xCC; } + if (odesc.aux1 == 0) { + if (odesc.aux0==0) { + odesc.aux0 = imm_encode; + } + else { + if (odesc.aux0 != imm_encode && odesc.aux0 != coff_encode) { + odesc.aux1 = imm_encode; + } + } + } + else { + assert(odesc.aux1==imm_encode); + } + + } + + assert(sizeof(odesc.opnds) == sizeof(oinfo.opnds)); + memcpy(odesc.opnds, oinfo.opnds, + sizeof(EncoderBase::OpndDesc) + * EncoderBase::MAX_NUM_OPCODE_OPERANDS); + odesc.roles = oinfo.roles; + odesc.first_opnd = 0; + if (odesc.opnds[0].reg != RegName_Null) { + ++odesc.first_opnd; + if (odesc.opnds[1].reg != RegName_Null) { + ++odesc.first_opnd; + } + } + + if (odesc.platf == OpcodeInfo::decoder) { + // if the opcode is only for decoding info, then do not hash it. + ++oindex; + continue; + } + + // + // check whether the operand info is a mask (i.e. r_m*). + // in this case, split the info to have separate entries for 'r' + // and for 'm'. + // the good news is that there can be only one such operand. 
+ // + int opnd2split = -1; + for (unsigned k=0; k<oinfo.roles.count; k++) { + if ((oinfo.opnds[k].kind & OpndKind_Mem) && + (OpndKind_Mem != oinfo.opnds[k].kind)) { + opnd2split = k; + break; + } + }; + + if (opnd2split == -1) { + // not a mask, hash it, store it, continue. + unsigned short hash = getHash(&oinfo); + opcodesHashMap[minfo->mn][hash] = (unsigned char)oindex; + ++oindex; + continue; + }; + + OpcodeInfo storeItem = oinfo; + unsigned short hash; + + // remove the memory part of the mask, and store only 'r' part + storeItem.opnds[opnd2split].kind = (OpndKind)(storeItem.opnds[opnd2split].kind & ~OpndKind_Mem); + hash = getHash(&storeItem); + if (opcodesHashMap[minfo->mn][hash] == NOHASH) { + opcodesHashMap[minfo->mn][hash] = (unsigned char)oindex; + } + // else { + // do not overwrite if there is something there, just check that operands match + // the reason is that for some instructions there are several possibilities: + // say 'DEC r' may be encode as either '48+r' or 'FF /1', and I believe + // the first one is better for 'dec r'. + // as we're currently processing an opcode with memory part in operand, + // leave already filled items intact, so if there is 'OP reg' there, this + // better choice will be left in the table instead of 'OP r_m' + // } + + // compute hash of memory-based operand, 'm' part in 'r_m' + storeItem.opnds[opnd2split].kind = OpndKind_Mem; + hash = getHash(&storeItem); + // should not happen: for the r_m opcodes, there is a possibility + // that hash value of 'r' part intersects with 'OP r' value, but it's + // impossible for 'm' part. 
+ assert(opcodesHashMap[minfo->mn][hash] == NOHASH); + opcodesHashMap[minfo->mn][hash] = (unsigned char)oindex; + + ++oindex; + } +} + +ENCODER_NAMESPACE_END diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.cpp b/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.cpp new file mode 100644 index 0000000..b8abffe --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.cpp @@ -0,0 +1,836 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdio.h> +#include <assert.h> +#include <limits.h> +#include "enc_base.h" +#include "enc_wrapper.h" +#include "dec_base.h" +#include "utils/Log.h" + +//#define PRINT_ENCODER_STREAM +bool dump_x86_inst = false; + +/** + * @brief Provides mapping between PhysicalReg and RegName used by encoder + * @param physicalReg The physical register + * @return Returns encoder's register name + */ +static RegName mapFromPhysicalReg (int physicalReg) +{ + RegName reg = RegName_Null; + + //Get mapping between PhysicalReg and RegName + switch (physicalReg) + { + case PhysicalReg_EAX: + reg = RegName_EAX; + break; + case PhysicalReg_EBX: + reg = RegName_EBX; + break; + case PhysicalReg_ECX: + reg = RegName_ECX; + break; + case PhysicalReg_EDX: + reg = RegName_EDX; + break; + case PhysicalReg_EDI: + reg = RegName_EDI; + break; + case PhysicalReg_ESI: + reg = RegName_ESI; + break; + case PhysicalReg_ESP: + reg = RegName_ESP; + break; + case PhysicalReg_EBP: + reg = RegName_EBP; + break; + case PhysicalReg_XMM0: + reg = RegName_XMM0; + break; + case PhysicalReg_XMM1: + reg = RegName_XMM1; + break; + case PhysicalReg_XMM2: + reg = RegName_XMM2; + break; + case PhysicalReg_XMM3: + reg = RegName_XMM3; + break; + case PhysicalReg_XMM4: + reg = RegName_XMM4; + break; + case PhysicalReg_XMM5: + reg = RegName_XMM5; + break; + case PhysicalReg_XMM6: + reg = RegName_XMM6; + break; + case PhysicalReg_XMM7: + reg = RegName_XMM7; + break; + default: + //We have no mapping + reg = RegName_Null; + break; + } + + return reg; +} + +//getRegSize, getAliasReg: +//OpndSize, RegName, OpndExt: enum enc_defs.h +inline void add_r(EncoderBase::Operands & args, int physicalReg, OpndSize sz, OpndExt ext = OpndExt_None) { + if (sz == OpndSize_128) + { + //For xmm registers, the encoder table contains them as 64-bit operands. Since semantics are determined + //by the encoding of the mnemonic, we change the size to 64-bit to make encoder happy. 
It will still + //generate the code for 128-bit size since for 64-bit all instructions have different encoding to use mmx. + sz = OpndSize_64; + } + + RegName reg = mapFromPhysicalReg (physicalReg); + if (sz != getRegSize(reg)) { + reg = getAliasReg(reg, sz); + } + args.add(EncoderBase::Operand(reg, ext)); +} +inline void add_m(EncoderBase::Operands & args, int baseReg, int disp, OpndSize sz, OpndExt ext = OpndExt_None) { + if (sz == OpndSize_128) + { + //For xmm registers, the encoder table contains them as 64-bit operands. Since semantics are determined + //by the encoding of the mnemonic, we change the size to 64-bit to make encoder happy. It will still + //generate the code for 128-bit size since for 64-bit all instructions have different encoding to use mmx. + sz = OpndSize_64; + } + + args.add(EncoderBase::Operand(sz, + mapFromPhysicalReg (baseReg), + RegName_Null, 0, + disp, ext)); +} +inline void add_m_scale(EncoderBase::Operands & args, int baseReg, int indexReg, int scale, + OpndSize sz, OpndExt ext = OpndExt_None) { + if (sz == OpndSize_128) + { + //For xmm registers, the encoder table contains them as 64-bit operands. Since semantics are determined + //by the encoding of the mnemonic, we change the size to 64-bit to make encoder happy. It will still + //generate the code for 128-bit size since for 64-bit all instructions have different encoding to use mmx. + sz = OpndSize_64; + } + + args.add(EncoderBase::Operand(sz, + mapFromPhysicalReg (baseReg), + mapFromPhysicalReg (indexReg), scale, + 0, ext)); +} +inline void add_m_disp_scale(EncoderBase::Operands & args, int baseReg, int disp, int indexReg, int scale, + OpndSize sz, OpndExt ext = OpndExt_None) { + if (sz == OpndSize_128) + { + //For xmm registers, the encoder table contains them as 64-bit operands. Since semantics are determined + //by the encoding of the mnemonic, we change the size to 64-bit to make encoder happy. 
It will still + //generate the code for 128-bit size since for 64-bit all instructions have different encoding to use mmx. + sz = OpndSize_64; + } + + args.add(EncoderBase::Operand(sz, + mapFromPhysicalReg (baseReg), + mapFromPhysicalReg (indexReg), scale, + disp, ext)); +} + +inline void add_fp(EncoderBase::Operands & args, unsigned i, bool dbl) { + return args.add((RegName)( (dbl ? RegName_FP0D : RegName_FP0S) + i)); +} +inline void add_imm(EncoderBase::Operands & args, OpndSize sz, int value, bool is_signed) { + //assert(n_size != imm.get_size()); + args.add(EncoderBase::Operand(sz, value, + is_signed ? OpndExt_Signed : OpndExt_Zero)); +} + +#define MAX_DECODED_STRING_LEN 1024 +char tmpBuffer[MAX_DECODED_STRING_LEN]; + +void printOperand(const EncoderBase::Operand & opnd) { + unsigned int sz; + if(!dump_x86_inst) return; + sz = strlen(tmpBuffer); + if(opnd.size() != OpndSize_32) { + const char * opndSizeString = getOpndSizeString(opnd.size()); + + if (opndSizeString == NULL) { + // If the string that represents operand size is null it means that + // the operand size is an invalid value. Although this could be a + // problem if instruction is corrupted, technically failing to + // disassemble is not fatal. Thus, let's warn but proceed with using + // an empty string. 
+ ALOGW("JIT-WARNING: Cannot decode instruction operand size."); + opndSizeString = ""; + } + + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN - sz, "%s ", + opndSizeString); + } + if(opnd.is_mem()) { + if(opnd.scale() != 0) { + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, + "%d(%s,%s,%d)", opnd.disp(), + getRegNameString(opnd.base()), + getRegNameString(opnd.index()), opnd.scale()); + } else { + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, "%d(%s)", + opnd.disp(), getRegNameString(opnd.base())); + } + } + if(opnd.is_imm()) { + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, "#%x", + (int)opnd.imm()); + } + if(opnd.is_reg()) { + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, "%s", + getRegNameString(opnd.reg())); + } +} +//TODO: the order of operands +//to make the printout have the same order as assembly in .S +//I reverse the order here +void printDecoderInst(Inst & decInst) { + unsigned int sz; + if(!dump_x86_inst) return; + sz = strlen(tmpBuffer); + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, "%s ", + EncoderBase::toStr(decInst.mn)); + for(unsigned int k = 0; k < decInst.argc; k++) { + if(k > 0) { + sz = strlen(tmpBuffer); + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, ", "); + } + printOperand(decInst.operands[decInst.argc-1-k]); + } + ALOGE("%s", tmpBuffer); +} +void printOperands(EncoderBase::Operands& opnds) { + unsigned int sz; + if(!dump_x86_inst) return; + for(unsigned int k = 0; k < opnds.count(); k++) { + if(k > 0) { + sz = strlen(tmpBuffer); + sz += snprintf(&tmpBuffer[sz], MAX_DECODED_STRING_LEN-sz, ", "); + } + printOperand(opnds[opnds.count()-1-k]); + } +} +void printEncoderInst(Mnemonic m, EncoderBase::Operands& opnds) { + if(!dump_x86_inst) return; + snprintf(tmpBuffer, MAX_DECODED_STRING_LEN, "--- ENC %s ", + EncoderBase::toStr(m)); + printOperands(opnds); + ALOGE("%s", tmpBuffer); +} +int decodeThenPrint(char* stream_start) { + if(!dump_x86_inst) return 0; + 
snprintf(tmpBuffer, MAX_DECODED_STRING_LEN, "--- INST @ %p: ", + stream_start); + Inst decInst; + unsigned numBytes = DecoderBase::decode(stream_start, &decInst); + printDecoderInst(decInst); + return numBytes; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm(Mnemonic m, OpndSize size, int imm, char * stream) { + EncoderBase::Operands args; + //assert(imm.get_size() == size_32); + add_imm(args, size, imm, true/*is_signed*/); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT unsigned encoder_get_inst_size(char * stream) { + Inst decInst; + unsigned numBytes = DecoderBase::decode(stream, &decInst); + return numBytes; +} + +extern "C" ENCODER_DECLARE_EXPORT uintptr_t encoder_get_cur_operand_offset(int opnd_id) +{ + return (uintptr_t)EncoderBase::getOpndLocation(opnd_id); +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_update_imm(int imm, char * stream) { + Inst decInst; + EncoderBase::Operands args; + + //Decode the instruction + DecoderBase::decode(stream, &decInst); + + add_imm(args, decInst.operands[0].size(), imm, true/*is_signed*/); + char* stream_next = (char *)EncoderBase::encode(stream, decInst.mn, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(decInst.mn, args); + decodeThenPrint(stream); +#endif + return stream_next; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, char * stream) { + EncoderBase::Operands args; + add_m(args, base_reg, disp, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * 
encoder_reg(Mnemonic m, OpndSize size, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + if(m == Mnemonic_DIV || m == Mnemonic_IDIV || m == Mnemonic_MUL || m == Mnemonic_IMUL) { + add_r(args, 0/*eax*/, size); + add_r(args, 3/*edx*/, size); + } + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +//! \brief Allows for different operand sizes +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm_reg(Mnemonic m, OpndSize size, + int imm, int reg, bool isPhysical, LowOpndRegType type, char * stream) { + return encoder_imm_reg_diff_sizes(m, size, imm, size, reg, isPhysical, type, stream); +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_reg_reg_diff_sizes(Mnemonic m, OpndSize srcOpndSize, + int reg, bool isPhysical, OpndSize destOpndSize, + int reg2, bool isPhysical2, LowOpndRegType type, char * stream) { + if((m == Mnemonic_MOV || m == Mnemonic_MOVQ || m == Mnemonic_MOVD) && reg == reg2) return stream; + EncoderBase::Operands args; + add_r(args, reg2, destOpndSize); //destination + if(m == Mnemonic_SAL || m == Mnemonic_SHR || m == Mnemonic_SHL || m == Mnemonic_SAR) + add_r(args, reg, OpndSize_8); + else + add_r(args, reg, srcOpndSize); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +//both operands have same size +extern "C" ENCODER_DECLARE_EXPORT char * encoder_reg_reg(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int reg2, bool isPhysical2, LowOpndRegType type, char * stream) { + return encoder_reg_reg_diff_sizes(m, size, reg, isPhysical, size, reg2, isPhysical2, type, stream); +} +//! 
\brief Allows for different operand sizes +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_to_reg_diff_sizes(Mnemonic m, OpndSize memOpndSize, + int disp, int base_reg, bool isBasePhysical, OpndSize regOpndSize, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, regOpndSize); + add_m(args, base_reg, disp, memOpndSize); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_reg(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + return encoder_mem_to_reg_diff_sizes(m, size, disp, base_reg, isBasePhysical, size, reg, isPhysical, type, stream); +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, size); + add_m_scale(args, base_reg, index_reg, scale, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_reg_mem_scale(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int base_reg, bool isBasePhysical, int index_reg, bool isIndexPhysical, int scale, + LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_m_scale(args, base_reg, index_reg, scale, size); + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char 
*)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +//! \brief Allows for different operand sizes +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_disp_scale_to_reg_diff_sizes(Mnemonic m, OpndSize memOpndSize, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + OpndSize regOpndSize, int reg, bool isPhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, regOpndSize); + add_m_disp_scale(args, base_reg, disp, index_reg, scale, memOpndSize); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_disp_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + return encoder_mem_disp_scale_to_reg_diff_sizes(m, size, base_reg, isBasePhysical, + disp, index_reg, isIndexPhysical, scale, size, reg, isPhysical, + type, stream); +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_movzs_mem_disp_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, OpndSize_32); + add_m_disp_scale(args, base_reg, disp, index_reg, scale, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char* 
encoder_reg_mem_disp_scale(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + LowOpndRegType type, char* stream) { + EncoderBase::Operands args; + add_m_disp_scale(args, base_reg, disp, index_reg, scale, size); + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_reg_mem(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int disp, int base_reg, bool isBasePhysical, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_m(args, base_reg, disp, size); + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + if (m == Mnemonic_CMPXCHG ){ + //CMPXCHG require EAX as args + add_r(args,PhysicalReg_EAX,size); + //Add lock prefix for CMPXCHG, guarantee the atomic of CMPXCHG in multi-core platform + stream = (char *)EncoderBase::prefix(stream, InstPrefix_LOCK); + } + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm_reg_diff_sizes (Mnemonic m, OpndSize sizeImm, int imm, + OpndSize sizeReg, int reg, bool isPhysical, LowOpndRegType type, char * stream) +{ + //Create the operands + EncoderBase::Operands args; + //Add destination register + add_r (args, reg, sizeReg); + //For imul, we need to add implicit register explicitly + if (m == Mnemonic_IMUL) + { + add_r (args, reg, sizeReg); + } + //Finally add the immediate + add_imm (args, sizeImm, imm, true/*is_signed*/); + +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + + //Now do the encoding + stream = 
EncoderBase::encode (stream, m, args); + +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_update_imm_rm(int imm, char * stream) { + Inst decInst; + EncoderBase::Operands args; + + //Decode the instruction + DecoderBase::decode(stream, &decInst); + + args.add(decInst.operands[0]); + add_imm(args, decInst.operands[1].size(), imm, true/*is_signed*/); + char* stream_next = (char *)EncoderBase::encode(stream, decInst.mn, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(decInst.mn, args); + decodeThenPrint(stream); +#endif + return stream_next; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm_mem(Mnemonic m, OpndSize size, + int imm, + int disp, int base_reg, bool isBasePhysical, char * stream) { + return encoder_imm_mem_diff_sizes(m, size, imm, size, disp, base_reg, isBasePhysical, stream); +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm_mem_diff_sizes (Mnemonic m, OpndSize immOpndSize, int imm, + OpndSize memOpndSize, int disp, int baseRegister, bool isBasePhysical, char * stream) +{ + //Add operands + EncoderBase::Operands args; + add_m (args, baseRegister, disp, memOpndSize); + add_imm (args, immOpndSize, imm, true); + +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + + //Do the encoding + stream = EncoderBase::encode (stream, m, args); + +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + + return stream; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_fp_mem(Mnemonic m, OpndSize size, int reg, + int disp, int base_reg, bool isBasePhysical, char * stream) { + EncoderBase::Operands args; + add_m(args, base_reg, disp, size); + // a fake FP register as operand + add_fp(args, reg, size == OpndSize_64/*is_double*/); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef 
PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_mem_fp(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, char * stream) { + EncoderBase::Operands args; + // a fake FP register as operand + add_fp(args, reg, size == OpndSize_64/*is_double*/); + add_m(args, base_reg, disp, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_return(char * stream) { + EncoderBase::Operands args; +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, Mnemonic_RET, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(Mnemonic_RET, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_compare_fp_stack(bool pop, int reg, bool isDouble, char * stream) { + Mnemonic m = pop ? Mnemonic_FUCOMIP : Mnemonic_FUCOMI; + //a single operand or 2 operands? + //FST ST(i) has a single operand in encoder.inl? 
+ EncoderBase::Operands args; + add_fp(args, reg, isDouble); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, m, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(m, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_movez_mem_to_reg(OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, OpndSize_32); + add_m(args, base_reg, disp, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, Mnemonic_MOVZX, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(Mnemonic_MOVZX, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_moves_mem_to_reg(OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, char * stream) { + EncoderBase::Operands args; + add_r(args, reg, OpndSize_32); + add_m(args, base_reg, disp, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, Mnemonic_MOVSX, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(Mnemonic_MOVSX, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * encoder_movez_reg_to_reg(OpndSize size, + int reg, bool isPhysical, int reg2, + bool isPhysical2, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg2, OpndSize_32); //destination + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, Mnemonic_MOVZX, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(Mnemonic_MOVZX, args); + decodeThenPrint(stream_start); +#endif + return stream; +} +extern "C" ENCODER_DECLARE_EXPORT char * 
encoder_moves_reg_to_reg(OpndSize size, + int reg, bool isPhysical,int reg2, + bool isPhysical2, LowOpndRegType type, char * stream) { + EncoderBase::Operands args; + add_r(args, reg2, OpndSize_32); //destination + add_r(args, reg, size); +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + stream = (char *)EncoderBase::encode(stream, Mnemonic_MOVSX, args); +#ifdef PRINT_ENCODER_STREAM + printEncoderInst(Mnemonic_MOVSX, args); + decodeThenPrint(stream_start); +#endif + return stream; +} + +extern "C" ENCODER_DECLARE_EXPORT char * encoder_imm_reg_reg (Mnemonic m, int imm, OpndSize immediateSize, + int sourceReg, OpndSize sourceRegSize, int destReg, OpndSize destRegSize, char * stream) +{ + EncoderBase::Operands args; + + //Add the source and destination registers + add_r (args, destReg, destRegSize); + add_r (args, sourceReg, sourceRegSize); + + //Now add the immediate. We expect in three operand situation that immediate is last argument + add_imm (args, immediateSize, imm, true/*is_signed*/); + +#ifdef PRINT_ENCODER_STREAM + char* stream_start = stream; +#endif + + //Do the actual encoding + stream = EncoderBase::encode (stream, m, args); + +#ifdef PRINT_ENCODER_STREAM + printEncoderInst (m, args); + decodeThenPrint (stream_start); +#endif + + //Return the updated stream pointer + return stream; +} + +/** + * @brief Generates variable sized nop instructions. + * @param numBytes Number of bytes for the nop instruction. If this value is + * larger than 9 bytes, more than one nop instruction will be generated. + * @param stream Instruction stream where to place the nops + * @return Updated instruction stream pointer after generating the nops + */ +extern "C" ENCODER_DECLARE_EXPORT char * encoder_nops(unsigned numBytes, char * stream) { + return EncoderBase::nops(stream, numBytes); +} + +// Disassemble the operand "opnd" and put the readable format in "strbuf" +// up to a string length of "len". 
+unsigned int DisassembleOperandToBuf(const EncoderBase::Operand& opnd, char* strbuf, unsigned int len) +{ + unsigned int sz = 0; + if(opnd.size() != OpndSize_32) { + const char * opndSizeString = getOpndSizeString(opnd.size()); + + if (opndSizeString == NULL) { + // If the string that represents operand size is null it means that + // the operand size is an invalid value. Although this could be a + // problem if instruction is corrupted, technically failing to + // disassemble is not fatal. Thus, let's warn but proceed with using + // an empty string. + ALOGW("JIT-WARNING: Cannot decode instruction operand size."); + opndSizeString = ""; + } + + sz += snprintf(&strbuf[sz], len-sz, "%s ", opndSizeString); + } + if(opnd.is_mem()) { + if(opnd.scale() != 0) { + sz += snprintf(&strbuf[sz], len-sz, "%d(%s,%s,%d)", opnd.disp(), + getRegNameString(opnd.base()), + getRegNameString(opnd.index()), opnd.scale()); + } else { + sz += snprintf(&strbuf[sz], len-sz, "%d(%s)", + opnd.disp(), getRegNameString(opnd.base())); + } + } else if(opnd.is_imm()) { + sz += snprintf(&strbuf[sz], len-sz, "#%x", (int)opnd.imm()); + } else if(opnd.is_reg()) { + sz += snprintf(&strbuf[sz], len-sz, "%s", + getRegNameString(opnd.reg())); + } + return sz; +} + +// Disassemble the instruction "decInst" and put the readable format +// in "strbuf" up to a string length of "len". +void DisassembleInstToBuf(Inst& decInst, char* strbuf, unsigned int len) +{ + unsigned int sz = 0; + int k; + sz += snprintf(&strbuf[sz], len-sz, "%s ", EncoderBase::toStr(decInst.mn)); + if (decInst.argc > 0) { + sz += DisassembleOperandToBuf(decInst.operands[decInst.argc-1], + &strbuf[sz], len-sz); + for(k = decInst.argc-2; k >= 0; k--) { + sz += snprintf(&strbuf[sz], len-sz, ", "); + sz += DisassembleOperandToBuf(decInst.operands[k], &strbuf[sz], len-sz); + } + } +} + +// Disassmble the x86 instruction pointed to by code pointer "stream." +// Put the disassemble text in the "strbuf" up to string length "len". 
+// Return the code pointer after the disassemble x86 instruction. +extern "C" ENCODER_DECLARE_EXPORT +char* decoder_disassemble_instr(char* stream, char* strbuf, unsigned int len) +{ + Inst decInst; + unsigned numBytes = DecoderBase::decode(stream, &decInst); + DisassembleInstToBuf(decInst, strbuf, len); + return (stream + numBytes); +} + +/** + * @brief Physical register char* counterparts + */ +static const char * PhysicalRegString[] = { "eax", "ebx", "ecx", "edx", "edi", + "esi", "esp", "ebp", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7", + "null" + }; + +/** + * @brief Scratch register char* counterparts + */ +static const char * ScratchRegString[] = { "scratch1", "scratch2", "scratch3", + "scratch4", "scratch5", "scratch6", "scratch7", "scratch8", "scratch9", + "scratch10" }; + +extern "C" ENCODER_DECLARE_EXPORT +/** + * @brief Transform a physical register into its char* counterpart + * @param reg the PhysicalReg we want to have a char* equivalent + * @return the register reg in char* form + */ +const char * physicalRegToString(PhysicalReg reg) +{ + if (reg < PhysicalReg_Null) { + return PhysicalRegString[reg]; + } else if (reg >= PhysicalReg_SCRATCH_1 && reg <= PhysicalReg_SCRATCH_10) { + return ScratchRegString[reg - PhysicalReg_SCRATCH_1]; + } else if (reg == PhysicalReg_Null) { + return "null"; + } else { + return "corrupted-data"; + } +} diff --git a/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.h b/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.h new file mode 100644 index 0000000..3d2e68d --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/enc_wrapper.h @@ -0,0 +1,283 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _VM_ENC_WRAPPER_H_ +#define _VM_ENC_WRAPPER_H_ + +#include "enc_defs_ext.h" + +extern bool dump_x86_inst; +typedef enum PhysicalReg { + // Currently initializing StartOfGPMarker to be 0 in order to match + // register index in Reg_No. However, ideally PhysicalReg_Null should + // be 0 and the rest moved over. + PhysicalReg_StartOfGPMarker = 0, + PhysicalReg_EAX = PhysicalReg_StartOfGPMarker, + PhysicalReg_EBX, PhysicalReg_ECX, PhysicalReg_EDX, + PhysicalReg_EDI, PhysicalReg_ESI, PhysicalReg_ESP, PhysicalReg_EBP, + PhysicalReg_EndOfGPMarker = PhysicalReg_EBP, + + PhysicalReg_StartOfXmmMarker, + PhysicalReg_XMM0 = PhysicalReg_StartOfXmmMarker, + PhysicalReg_XMM1, PhysicalReg_XMM2, PhysicalReg_XMM3, + PhysicalReg_XMM4, PhysicalReg_XMM5, PhysicalReg_XMM6, PhysicalReg_XMM7, + PhysicalReg_EndOfXmmMarker = PhysicalReg_XMM7, + + PhysicalReg_StartOfX87Marker, + PhysicalReg_ST0 = PhysicalReg_StartOfX87Marker, PhysicalReg_ST1, + PhysicalReg_ST2, PhysicalReg_ST3, PhysicalReg_ST4, PhysicalReg_ST5, + PhysicalReg_ST6, PhysicalReg_ST7, + PhysicalReg_EndOfX87Marker = PhysicalReg_ST7, + + PhysicalReg_Null, + //used as scratch logical register in NCG O1 + //should not overlap with regular logical register, start from 100 + PhysicalReg_SCRATCH_1 = 100, PhysicalReg_SCRATCH_2, PhysicalReg_SCRATCH_3, PhysicalReg_SCRATCH_4, + PhysicalReg_SCRATCH_5, PhysicalReg_SCRATCH_6, PhysicalReg_SCRATCH_7, PhysicalReg_SCRATCH_8, + PhysicalReg_SCRATCH_9, PhysicalReg_SCRATCH_10, + + //This should be the last entry + PhysicalReg_Last = PhysicalReg_SCRATCH_10 +} 
PhysicalReg; + +typedef enum Reg_No { +#ifdef _EM64T_ + rax_reg = 0,rbx_reg, rcx_reg, rdx_reg, + rdi_reg, rsi_reg, rsp_reg, rbp_reg, + r8_reg, r9_reg, r10_reg, r11_reg, + r12_reg, r13_reg, r14_reg, r15_reg, + xmm0_reg, xmm1_reg, xmm2_reg, xmm3_reg, + xmm4_reg, xmm5_reg, xmm6_reg, xmm7_reg, + xmm8_reg, xmm9_reg, xmm10_reg, xmm11_reg, + xmm12_reg, xmm13_reg, xmm14_reg, xmm15_reg, + +#else // !defined(_EM64T_) + + eax_reg = 0,ebx_reg, ecx_reg, edx_reg, + edi_reg, esi_reg, esp_reg, ebp_reg, + xmm0_reg, xmm1_reg, xmm2_reg, xmm3_reg, + xmm4_reg, xmm5_reg, xmm6_reg, xmm7_reg, + fs_reg, +#endif + /** @brief Total number of registers.*/ + n_reg +} Reg_No; +// +// instruction operand sizes: 8,16,32,64 bits +// +typedef enum Opnd_Size { + size_8 = 0, + size_16, + size_32, + size_64, + n_size, +#ifdef _EM64T_ + size_platf = size_64 +#else + size_platf = size_32 +#endif +} Opnd_Size; + +// +// opcodes for alu instructions +// +typedef enum ALU_Opcode { + add_opc = 0,or_opc, adc_opc, sbb_opc, + and_opc, sub_opc, xor_opc, cmp_opc, + mul_opc, imul_opc, div_opc, idiv_opc, + sll_opc, srl_opc, sra_opc, //shift right arithmetic + shl_opc, shr_opc, + sal_opc, sar_opc, + neg_opc, not_opc, andn_opc, + n_alu +} ALU_Opcode; + +typedef enum ConditionCode { + Condition_O = 0, + Condition_NO = 1, + Condition_B = 2, + Condition_NAE = Condition_B, + Condition_C = Condition_B, + Condition_NB = 3, + Condition_AE = Condition_NB, + Condition_NC = Condition_NB, + Condition_Z = 4, + Condition_E = Condition_Z, + Condition_NZ = 5, + Condition_NE = Condition_NZ, + Condition_BE = 6, + Condition_NA = Condition_BE, + Condition_NBE = 7, + Condition_A = Condition_NBE, + + Condition_S = 8, + Condition_NS = 9, + Condition_P = 10, + Condition_PE = Condition_P, + Condition_NP = 11, + Condition_PO = Condition_NP, + Condition_L = 12, + Condition_NGE = Condition_L, + Condition_NL = 13, + Condition_GE = Condition_NL, + Condition_LE = 14, + Condition_NG = Condition_LE, + Condition_NLE = 15, + Condition_G = 
Condition_NLE, + Condition_Count = 16 +} ConditionCode; + +// +// prefix code +// +typedef enum InstrPrefix { + no_prefix, + lock_prefix = 0xF0, + hint_branch_taken_prefix = 0x2E, + hint_branch_not_taken_prefix = 0x3E, + prefix_repne = 0xF2, + prefix_repnz = prefix_repne, + prefix_repe = 0xF3, + prefix_repz = prefix_repe, + prefix_rep = 0xF3, + prefix_cs = 0x2E, + prefix_ss = 0x36, + prefix_ds = 0x3E, + prefix_es = 0x26, + prefix_fs = 0x64, + prefix_gs = 0x65 +} InstrPrefix; + +enum LowOpndRegType +{ + LowOpndRegType_gp = 0, + LowOpndRegType_fs = 1, + LowOpndRegType_xmm = 2, + LowOpndRegType_fs_s = 3, + LowOpndRegType_ss = 4, + LowOpndRegType_invalid = 256, +}; + +enum LogicalRegType +{ + LogicalType_invalid = 0, + LowOpndRegType_scratch = 8, + LowOpndRegType_temp = 16, + LowOpndRegType_hard = 32, + LowOpndRegType_virtual = 64, +}; + +//if inline, separte enc_wrapper.cpp into two files, one of them is .inl +// enc_wrapper.cpp needs to handle both cases +#ifdef ENCODER_INLINE + #define ENCODER_DECLARE_EXPORT inline + #include "enc_wrapper.inl" +#else + #define ENCODER_DECLARE_EXPORT +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif +ENCODER_DECLARE_EXPORT char* encoder_imm(Mnemonic m, OpndSize size, + int imm, char* stream); +ENCODER_DECLARE_EXPORT unsigned encoder_get_inst_size(char * stream); +ENCODER_DECLARE_EXPORT char* encoder_update_imm(int imm, char * stream); +ENCODER_DECLARE_EXPORT char* encoder_mem(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_reg(Mnemonic m, OpndSize size, + int reg, bool isPhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_reg_reg(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int reg2, bool isPhysical2, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_reg_reg_diff_sizes(Mnemonic m, OpndSize srcOpndSize, + int reg, bool isPhysical, OpndSize destOpndSize, + int reg2, bool isPhysical2, 
LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_mem_reg(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_mem_to_reg_diff_sizes(Mnemonic m, OpndSize memOpndSize, + int disp, int base_reg, bool isBasePhysical, OpndSize regOpndSize, + int reg, bool isPhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_mem_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_reg_mem_scale(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int base_reg, bool isBasePhysical, int index_reg, bool isIndexPhysical, int scale, + LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char * encoder_mem_disp_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_mem_disp_scale_to_reg_diff_sizes(Mnemonic m, OpndSize memOpndSize, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + OpndSize regOpndSize, int reg, bool isPhysical, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_movzs_mem_disp_scale_reg(Mnemonic m, OpndSize size, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + int reg, bool isPhysical, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_mem_disp_scale_to_reg_2(Mnemonic m, OpndSize memOpndSize, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + OpndSize regOpndSize, int reg, bool isPhysical, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char* 
encoder_reg_mem_disp_scale(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int base_reg, bool isBasePhysical, int disp, int index_reg, bool isIndexPhysical, int scale, + LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_reg_mem(Mnemonic m, OpndSize size, + int reg, bool isPhysical, + int disp, int base_reg, bool isBasePhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_imm_reg(Mnemonic m, OpndSize size, + int imm, int reg, bool isPhysical, LowOpndRegType type, char* stream); +ENCODER_DECLARE_EXPORT char * encoder_imm_reg_diff_sizes(Mnemonic m, OpndSize sizeImm, + int imm, OpndSize sizeReg, int reg, bool isPhysical, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_update_imm_rm(int imm, char * stream); +ENCODER_DECLARE_EXPORT char* encoder_imm_mem(Mnemonic m, OpndSize size, + int imm, + int disp, int base_reg, bool isBasePhysical, char* stream); +ENCODER_DECLARE_EXPORT char * encoder_imm_mem_diff_sizes (Mnemonic m, OpndSize immOpndSize, int imm, + OpndSize memOpndSize, int disp, int baseRegister, bool isBasePhysical, char * stream); +ENCODER_DECLARE_EXPORT char* encoder_fp_mem(Mnemonic m, OpndSize size, int reg, + int disp, int base_reg, bool isBasePhysical, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_mem_fp(Mnemonic m, OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_return(char* stream); +ENCODER_DECLARE_EXPORT char* encoder_compare_fp_stack(bool pop, int reg, bool isDouble, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_movez_mem_to_reg(OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, char* stream); +ENCODER_DECLARE_EXPORT char* encoder_moves_mem_to_reg(OpndSize size, + int disp, int base_reg, bool isBasePhysical, + int reg, bool isPhysical, char* stream); +ENCODER_DECLARE_EXPORT char * encoder_movez_reg_to_reg(OpndSize size, + int reg, bool 
isPhysical, int reg2, + bool isPhysical2, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_moves_reg_to_reg(OpndSize size, + int reg, bool isPhysical, int reg2, + bool isPhysical2, LowOpndRegType type, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_imm_reg_reg (Mnemonic m, int imm, OpndSize immediateSize, + int sourceReg, OpndSize sourceRegSize, int destReg, + OpndSize destRegSize, char * stream); +ENCODER_DECLARE_EXPORT char * encoder_nops(unsigned numBytes, char * stream); +ENCODER_DECLARE_EXPORT int decodeThenPrint(char* stream_start); +ENCODER_DECLARE_EXPORT char* decoder_disassemble_instr(char* stream, char* strbuf, unsigned int len); + +//Provide a char* equivalent to a PhysicalReg type +ENCODER_DECLARE_EXPORT const char * physicalRegToString(PhysicalReg reg); +#ifdef __cplusplus +} +#endif +#endif // _VM_ENC_WRAPPER_H_ diff --git a/libpixelflinger/codeflinger/x86/libenc/encoder.h b/libpixelflinger/codeflinger/x86/libenc/encoder.h new file mode 100644 index 0000000..9ac0219 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/encoder.h @@ -0,0 +1,717 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. 
Astapchuk + */ +/** + * @file + * @brief Simple interface for generating processor instructions. + * + * The interface works for both IA32 and EM64T. By default, only IA32 + * capabilities are presented. To enable EM64T feature, the _EM64T_ macro + * must be defined (and, of course, a proper library version to be used). + * + * The interface is based on the original ia32.h encoder interface, + * with some simplifications and add-ons - EM64T-specific, SSE and SSE2. + * + * The interface mostly intended for existing legacy code like LIL code + * generator. From the implementation point of view, it's just a wrapper + * around the EncoderBase functionality. + */ + +#ifndef _VM_ENCODER_H_ +#define _VM_ENCODER_H_ + +#include <limits.h> +#include "enc_base.h" +//#include "open/types.h" + +#ifdef _EM64T_ +// size of general-purpose value on the stack in bytes +#define GR_STACK_SIZE 8 +// size of floating-point value on the stack in bytes +#define FR_STACK_SIZE 8 + +#if defined(WIN32) || defined(_WIN64) + // maximum number of GP registers for inputs + const int MAX_GR = 4; + // maximum number of FP registers for inputs + const int MAX_FR = 4; + // WIN64 reserves 4 words for shadow space + const int SHADOW = 4 * GR_STACK_SIZE; +#else + // maximum number of GP registers for inputs + const int MAX_GR = 6; + // maximum number of FP registers for inputs + const int MAX_FR = 8; + // Linux x64 doesn't reserve shadow space + const int SHADOW = 0; +#endif + +#else +// size of general-purpose value on the stack in bytes +#define GR_STACK_SIZE 4 +// size of general-purpose value on the stack in bytes +#define FR_STACK_SIZE 8 + +// maximum number of GP registers for inputs +const int MAX_GR = 0; +// maximum number of FP registers for inputs +const int MAX_FR = 0; +#endif + +typedef enum Reg_No { +#ifdef _EM64T_ + rax_reg = 0,rbx_reg, rcx_reg, rdx_reg, + rdi_reg, rsi_reg, rsp_reg, rbp_reg, + r8_reg, r9_reg, r10_reg, r11_reg, + r12_reg, r13_reg, r14_reg, r15_reg, + xmm0_reg, xmm1_reg, 
xmm2_reg, xmm3_reg, + xmm4_reg, xmm5_reg, xmm6_reg, xmm7_reg, + xmm8_reg, xmm9_reg, xmm10_reg, xmm11_reg, + xmm12_reg, xmm13_reg, xmm14_reg, xmm15_reg, + +#else // !defined(_EM64T_) + + eax_reg = 0,ebx_reg, ecx_reg, edx_reg, + edi_reg, esi_reg, esp_reg, ebp_reg, + xmm0_reg, xmm1_reg, xmm2_reg, xmm3_reg, + xmm4_reg, xmm5_reg, xmm6_reg, xmm7_reg, + fs_reg, +#endif + /** @brief Total number of registers.*/ + n_reg +} Reg_No; +// +// instruction operand sizes: 8,16,32,64 bits +// +typedef enum Opnd_Size { + size_8 = 0, + size_16, + size_32, + size_64, + n_size, +#ifdef _EM64T_ + size_platf = size_64 +#else + size_platf = size_32 +#endif +} Opnd_Size; + +// +// opcodes for alu instructions +// +typedef enum ALU_Opcode { + add_opc = 0,or_opc, adc_opc, sbb_opc, + and_opc, sub_opc, xor_opc, cmp_opc, + n_alu +} ALU_Opcode; + +// +// opcodes for shift instructions +// +typedef enum Shift_Opcode { + shld_opc, shrd_opc, shl_opc, shr_opc, + sar_opc, ror_opc, max_shift_opcode=6, n_shift = 6 +} Shift_Opcode; + +typedef enum ConditionCode { + Condition_O = 0, + Condition_NO = 1, + Condition_B = 2, + Condition_NAE = Condition_B, + Condition_C = Condition_B, + Condition_NB = 3, + Condition_AE = Condition_NB, + Condition_NC = Condition_NB, + Condition_Z = 4, + Condition_E = Condition_Z, + Condition_NZ = 5, + Condition_NE = Condition_NZ, + Condition_BE = 6, + Condition_NA = Condition_BE, + Condition_NBE = 7, + Condition_A = Condition_NBE, + + Condition_S = 8, + Condition_NS = 9, + Condition_P = 10, + Condition_PE = Condition_P, + Condition_NP = 11, + Condition_PO = Condition_NP, + Condition_L = 12, + Condition_NGE = Condition_L, + Condition_NL = 13, + Condition_GE = Condition_NL, + Condition_LE = 14, + Condition_NG = Condition_LE, + Condition_NLE = 15, + Condition_G = Condition_NLE, + Condition_Count = 16 +} ConditionCode; + +// +// prefix code +// +typedef enum InstrPrefix { + no_prefix, + lock_prefix = 0xF0, + hint_branch_taken_prefix = 0x2E, + hint_branch_not_taken_prefix = 0x3E, + 
prefix_repne = 0xF2, + prefix_repnz = prefix_repne, + prefix_repe = 0xF3, + prefix_repz = prefix_repe, + prefix_rep = 0xF3, + prefix_cs = 0x2E, + prefix_ss = 0x36, + prefix_ds = 0x3E, + prefix_es = 0x26, + prefix_fs = 0x64, + prefix_gs = 0x65 +} InstrPrefix; + + +// +// an instruction operand +// +class Opnd { + +protected: + enum Tag { SignedImm, UnsignedImm, Reg, Mem, FP, XMM }; + + const Tag tag; + + Opnd(Tag t): tag(t) {} + +public: + void * operator new(size_t, void * mem) { + return mem; + } + + void operator delete(void *) {} + + void operator delete(void *, void *) {} + +private: + // disallow copying + Opnd(const Opnd &): tag(Mem) { assert(false); } + Opnd& operator=(const Opnd &) { assert(false); return *this; } +}; +typedef int I_32; +class Imm_Opnd: public Opnd { + +protected: + union { +#ifdef _EM64T_ + int64 value; + unsigned char bytes[8]; +#else + I_32 value; + unsigned char bytes[4]; +#endif + }; + Opnd_Size size; + +public: + Imm_Opnd(I_32 val, bool isSigned = true): + Opnd(isSigned ? SignedImm : UnsignedImm), value(val), size(size_32) { + if (isSigned) { + if (CHAR_MIN <= val && val <= CHAR_MAX) { + size = size_8; + } else if (SHRT_MIN <= val && val <= SHRT_MAX) { + size = size_16; + } + } else { + assert(val >= 0); + if (val <= UCHAR_MAX) { + size = size_8; + } else if (val <= USHRT_MAX) { + size = size_16; + } + } + } + Imm_Opnd(const Imm_Opnd& that): Opnd(that.tag), value(that.value), size(that.size) {}; + +#ifdef _EM64T_ + Imm_Opnd(Opnd_Size sz, int64 val, bool isSigned = true): + Opnd(isSigned ? 
SignedImm : UnsignedImm), value(val), size(sz) { +#ifndef NDEBUG + switch (size) { + case size_8: + assert(val == (int64)(I_8)val); + break; + case size_16: + assert(val == (int64)(int16)val); + break; + case size_32: + assert(val == (int64)(I_32)val); + break; + case size_64: + break; + case n_size: + assert(false); + break; + } +#endif // NDEBUG + } + + int64 get_value() const { return value; } + +#else + + Imm_Opnd(Opnd_Size sz, I_32 val, int isSigned = true): + Opnd(isSigned ? SignedImm : UnsignedImm), value(val), size(sz) { +#ifndef NDEBUG + switch (size) { + case size_8: + assert((I_32)val == (I_32)(I_8)val); + break; + case size_16: + assert((I_32)val == (I_32)(int16)val); + break; + case size_32: + break; + case size_64: + case n_size: + assert(false); + break; + } +#endif // NDEBUG + } + + I_32 get_value() const { return value; } + +#endif + Opnd_Size get_size() const { return size; } + bool is_signed() const { return tag == SignedImm; } +}; + +class RM_Opnd: public Opnd { + +public: + bool is_reg() const { return tag != SignedImm && tag != UnsignedImm && tag != Mem; } + +protected: + RM_Opnd(Tag t): Opnd(t) {} + +private: + // disallow copying + RM_Opnd(const RM_Opnd &): Opnd(Reg) { assert(false); } +}; + +class R_Opnd: public RM_Opnd { + +protected: + Reg_No _reg_no; + +public: + R_Opnd(Reg_No r): RM_Opnd(Reg), _reg_no(r) {} + Reg_No reg_no() const { return _reg_no; } + +private: + // disallow copying + R_Opnd(const R_Opnd &): RM_Opnd(Reg) { assert(false); } +}; + +// +// a memory operand with displacement +// Can also serve as a full memory operand with base,index, displacement and scale. +// Use n_reg to specify 'no register', say, for index. 
+class M_Opnd: public RM_Opnd { + +protected: + Imm_Opnd m_disp; + Imm_Opnd m_scale; + R_Opnd m_index; + R_Opnd m_base; + +public: + //M_Opnd(Opnd_Size sz): RM_Opnd(Mem, K_M, sz), m_disp(0), m_scale(0), m_index(n_reg), m_base(n_reg) {} + M_Opnd(I_32 disp): + RM_Opnd(Mem), m_disp(disp), m_scale(0), m_index(n_reg), m_base(n_reg) {} + M_Opnd(Reg_No rbase, I_32 rdisp): + RM_Opnd(Mem), m_disp(rdisp), m_scale(0), m_index(n_reg), m_base(rbase) {} + M_Opnd(I_32 disp, Reg_No rbase, Reg_No rindex, unsigned scale): + RM_Opnd(Mem), m_disp(disp), m_scale(scale), m_index(rindex), m_base(rbase) {} + M_Opnd(const M_Opnd & that) : RM_Opnd(Mem), + m_disp((int)that.m_disp.get_value()), m_scale((int)that.m_scale.get_value()), + m_index(that.m_index.reg_no()), m_base(that.m_base.reg_no()) + {} + // + inline const R_Opnd & base(void) const { return m_base; } + inline const R_Opnd & index(void) const { return m_index; } + inline const Imm_Opnd & scale(void) const { return m_scale; } + inline const Imm_Opnd & disp(void) const { return m_disp; } +}; + +// +// a memory operand with base register and displacement +// +class M_Base_Opnd: public M_Opnd { + +public: + M_Base_Opnd(Reg_No base, I_32 disp) : M_Opnd(disp, base, n_reg, 0) {} + +private: + // disallow copying - but it leads to ICC errors #734 in encoder.inl + // M_Base_Opnd(const M_Base_Opnd &): M_Opnd(0) { assert(false); } +}; + +// +// a memory operand with base register, scaled index register +// and displacement. 
+// +class M_Index_Opnd : public M_Opnd { + +public: + M_Index_Opnd(Reg_No base, Reg_No index, I_32 disp, unsigned scale): + M_Opnd(disp, base, index, scale) {} + +private: + // disallow copying - but it leads to ICC errors #734 in encoder.inl + // M_Index_Opnd(const M_Index_Opnd &): M_Opnd(0) { assert(false); } +}; + +class XMM_Opnd : public Opnd { + +protected: + unsigned m_idx; + +public: + XMM_Opnd(unsigned _idx): Opnd(XMM), m_idx(_idx) {}; + unsigned get_idx( void ) const { return m_idx; }; + +private: + // disallow copying + XMM_Opnd(const XMM_Opnd &): Opnd(XMM) { assert(false); } +}; + +// +// operand structures for ia32 registers +// +#ifdef _EM64T_ + +extern R_Opnd rax_opnd; +extern R_Opnd rcx_opnd; +extern R_Opnd rdx_opnd; +extern R_Opnd rbx_opnd; +extern R_Opnd rdi_opnd; +extern R_Opnd rsi_opnd; +extern R_Opnd rsp_opnd; +extern R_Opnd rbp_opnd; + +extern R_Opnd r8_opnd; +extern R_Opnd r9_opnd; +extern R_Opnd r10_opnd; +extern R_Opnd r11_opnd; +extern R_Opnd r12_opnd; +extern R_Opnd r13_opnd; +extern R_Opnd r14_opnd; +extern R_Opnd r15_opnd; + +extern XMM_Opnd xmm8_opnd; +extern XMM_Opnd xmm9_opnd; +extern XMM_Opnd xmm10_opnd; +extern XMM_Opnd xmm11_opnd; +extern XMM_Opnd xmm12_opnd; +extern XMM_Opnd xmm13_opnd; +extern XMM_Opnd xmm14_opnd; +extern XMM_Opnd xmm15_opnd; +#else + +extern R_Opnd eax_opnd; +extern R_Opnd ecx_opnd; +extern R_Opnd edx_opnd; +extern R_Opnd ebx_opnd; +extern R_Opnd esp_opnd; +extern R_Opnd ebp_opnd; +extern R_Opnd esi_opnd; +extern R_Opnd edi_opnd; + +#endif // _EM64T_ + +extern XMM_Opnd xmm0_opnd; +extern XMM_Opnd xmm1_opnd; +extern XMM_Opnd xmm2_opnd; +extern XMM_Opnd xmm3_opnd; +extern XMM_Opnd xmm4_opnd; +extern XMM_Opnd xmm5_opnd; +extern XMM_Opnd xmm6_opnd; +extern XMM_Opnd xmm7_opnd; + +#ifdef NO_ENCODER_INLINE + #define ENCODER_DECLARE_EXPORT +#else + #define ENCODER_DECLARE_EXPORT inline + #include "encoder.inl" +#endif + +// prefix +ENCODER_DECLARE_EXPORT char * prefix(char * stream, InstrPrefix p); + +// stack push and 
pop instructions +ENCODER_DECLARE_EXPORT char * push(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * push(char * stream, const Imm_Opnd & imm); +ENCODER_DECLARE_EXPORT char * pop(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +// cmpxchg or xchg +ENCODER_DECLARE_EXPORT char * cmpxchg(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * xchg(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz = size_platf); + +// inc(rement), dec(rement), not, neg(ate) instructions +ENCODER_DECLARE_EXPORT char * inc(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * dec(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * _not(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * neg(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * nop(char * stream); +ENCODER_DECLARE_EXPORT char * int3(char * stream); + +// alu instructions: add, or, adc, sbb, and, sub, xor, cmp +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const M_Opnd & m, const R_Opnd & r, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +// test instruction +ENCODER_DECLARE_EXPORT char * test(char * stream, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * test(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz = size_platf); + +// shift instructions: shl, shr, sar, shld, shrd, ror +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode opc, const RM_Opnd & rm, const Imm_Opnd & imm, 
Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode opc, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode opc, const RM_Opnd & rm, const R_Opnd & r, const Imm_Opnd & imm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode opc, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz = size_platf); + +// multiply instructions: mul, imul +ENCODER_DECLARE_EXPORT char * mul(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const Imm_Opnd & imm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const RM_Opnd & rm, const Imm_Opnd& imm, Opnd_Size sz = size_platf); + +// divide instructions: div, idiv +ENCODER_DECLARE_EXPORT char * idiv(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * div(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +// data movement: mov +ENCODER_DECLARE_EXPORT char * mov(char * stream, const M_Opnd & m, const R_Opnd & r, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * mov(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * mov(char * stream, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz = size_platf); + +ENCODER_DECLARE_EXPORT char * movsx( char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * movzx( char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +ENCODER_DECLARE_EXPORT char * movd(char * stream, const RM_Opnd & rm, const XMM_Opnd & xmm); +ENCODER_DECLARE_EXPORT char * movd(char * stream, const XMM_Opnd & xmm, const RM_Opnd & rm); 
+ENCODER_DECLARE_EXPORT char * movq(char * stream, const RM_Opnd & rm, const XMM_Opnd & xmm); +ENCODER_DECLARE_EXPORT char * movq(char * stream, const XMM_Opnd & xmm, const RM_Opnd & rm); + +// sse mov +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const M_Opnd & mem, const XMM_Opnd & xmm, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); + +// sse add, sub, mul, div +ENCODER_DECLARE_EXPORT char * sse_add(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_add(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); + +ENCODER_DECLARE_EXPORT char * sse_sub(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_sub(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); + +ENCODER_DECLARE_EXPORT char * sse_mul(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_mul(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); + +ENCODER_DECLARE_EXPORT char * sse_div(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_div(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); + +// xor, compare +ENCODER_DECLARE_EXPORT char * sse_xor(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1); + +ENCODER_DECLARE_EXPORT char * sse_compare(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_compare(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem, bool dbl); + +// sse conversions +ENCODER_DECLARE_EXPORT char * sse_cvt_si(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_cvtt2si(char * stream, const R_Opnd & reg, const 
M_Opnd & mem, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_cvtt2si(char * stream, const R_Opnd & reg, const XMM_Opnd & xmm, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_cvt_fp2dq(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_cvt_dq2fp(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl); +ENCODER_DECLARE_EXPORT char * sse_d2s(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem64); +ENCODER_DECLARE_EXPORT char * sse_d2s(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1); +ENCODER_DECLARE_EXPORT char * sse_s2d(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem32); +ENCODER_DECLARE_EXPORT char * sse_s2d(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1); + +// condition operations +ENCODER_DECLARE_EXPORT char * cmov(char * stream, ConditionCode cc, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * setcc(char * stream, ConditionCode cc, const RM_Opnd & rm8); + +// load effective address: lea +ENCODER_DECLARE_EXPORT char * lea(char * stream, const R_Opnd & r, const M_Opnd & m, Opnd_Size sz = size_platf); +ENCODER_DECLARE_EXPORT char * cdq(char * stream); +ENCODER_DECLARE_EXPORT char * wait(char * stream); + +// control-flow instructions +ENCODER_DECLARE_EXPORT char * loop(char * stream, const Imm_Opnd & imm); + +// jump with 8-bit relative +ENCODER_DECLARE_EXPORT char * jump8(char * stream, const Imm_Opnd & imm); + +// jump with 32-bit relative +ENCODER_DECLARE_EXPORT char * jump32(char * stream, const Imm_Opnd & imm); + +// register indirect jump +ENCODER_DECLARE_EXPORT char * jump(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +// jump to target address +ENCODER_DECLARE_EXPORT char *jump(char * stream, char *target); + +// jump with displacement +//char * jump(char * stream, I_32 disp); + +// conditional branch with 8-bit branch offset +ENCODER_DECLARE_EXPORT char * branch8(char * stream, 
ConditionCode cc, const Imm_Opnd & imm, InstrPrefix prefix = no_prefix); + +// conditional branch with 32-bit branch offset +ENCODER_DECLARE_EXPORT char * branch32(char * stream, ConditionCode cc, const Imm_Opnd & imm, InstrPrefix prefix = no_prefix); + +// conditional branch with target label address +//char * branch(char * stream, ConditionCode cc, const char * target, InstrPrefix prefix = no_prefix); + +// conditional branch with displacement immediate +ENCODER_DECLARE_EXPORT char * branch(char * stream, ConditionCode cc, I_32 disp, InstrPrefix prefix = no_prefix); + +// call with displacement +ENCODER_DECLARE_EXPORT char * call(char * stream, const Imm_Opnd & imm); + +// indirect call through register or memory location +ENCODER_DECLARE_EXPORT char * call(char * stream, const RM_Opnd & rm, Opnd_Size sz = size_platf); + +// call target address +ENCODER_DECLARE_EXPORT char * call(char * stream, const char * target); + +// return instruction +ENCODER_DECLARE_EXPORT char * ret(char * stream); +ENCODER_DECLARE_EXPORT char * ret(char * stream, unsigned short pop); +ENCODER_DECLARE_EXPORT char * ret(char * stream, const Imm_Opnd & imm); + +// string operations +ENCODER_DECLARE_EXPORT char * set_d(char * stream, bool set); +ENCODER_DECLARE_EXPORT char * scas(char * stream, unsigned char prefix); +ENCODER_DECLARE_EXPORT char * stos(char * stream, unsigned char prefix); + +// floating-point instructions + +// st(0) = st(0) fp_op m{32,64}real +//!char * fp_op_mem(char * stream, FP_Opcode opc,const M_Opnd& mem,int is_double); + +// st(0) = st(0) fp_op st(i) +//!char *fp_op(char * stream, FP_Opcode opc,unsigned i); + +// st(i) = st(i) fp_op st(0) ; optionally pop stack +//!char * fp_op(char * stream, FP_Opcode opc,unsigned i,unsigned pop_stk); + +// compare st(0),st(1) and pop stack twice +//!char * fcompp(char * stream); +ENCODER_DECLARE_EXPORT char * fldcw(char * stream, const M_Opnd & mem); +ENCODER_DECLARE_EXPORT char * fnstcw(char * stream, const M_Opnd & mem); 
+ENCODER_DECLARE_EXPORT char * fnstsw(char * stream); +//!char * fchs(char * stream); +//!char * frem(char * stream); +//!char * fxch(char * stream,unsigned i); +//!char * fcomip(char * stream, unsigned i); + +// load from memory (as fp) into fp register stack +ENCODER_DECLARE_EXPORT char * fld(char * stream, const M_Opnd & m, bool is_double); +//!char *fld80(char * stream,const M_Opnd& mem); + +// load from memory (as int) into fp register stack +//!char * fild(char * stream,const M_Opnd& mem,int is_long); + +// push st(i) onto fp register stack +//!char * fld(char * stream,unsigned i); + +// push the constants 0.0 and 1.0 onto the fp register stack +//!char * fldz(char * stream); +//!char * fld1(char * stream); + +// store stack to memory (as int), always popping the stack +ENCODER_DECLARE_EXPORT char * fist(char * stream, const M_Opnd & mem, bool is_long, bool pop_stk); +// store stack to to memory (as fp), optionally popping the stack +ENCODER_DECLARE_EXPORT char * fst(char * stream, const M_Opnd & m, bool is_double, bool pop_stk); +// store ST(0) to ST(i), optionally popping the stack. 
Takes 1 clock +ENCODER_DECLARE_EXPORT char * fst(char * stream, unsigned i, bool pop_stk); + +//!char * pushad(char * stream); +//!char * pushfd(char * stream); +//!char * popad(char * stream); +//!char * popfd(char * stream); + +// stack frame allocation instructions: enter & leave +// +// enter frame_size +// +// is equivalent to: +// +// push ebp +// mov ebp,esp +// sub esp,frame_size +// +//!char *enter(char * stream,const Imm_Opnd& imm); + +// leave +// is equivalent to: +// +// mov esp,ebp +// pop ebp +//!char *leave(char * stream); + +// sahf loads SF, ZF, AF, PF, and CF flags from eax +//!char *sahf(char * stream); + +// Intrinsic FP math functions + +//!char *math_fsin(char * stream); +//!char *math_fcos(char * stream); +//!char *math_fabs(char * stream); +//!char *math_fpatan(char * stream); +ENCODER_DECLARE_EXPORT char * fprem(char * stream); +ENCODER_DECLARE_EXPORT char * fprem1(char * stream); +//!char *math_frndint(char * stream); +//!char *math_fptan(char * stream); + +// +// Add 1-7 bytes padding, with as few instructions as possible, +// with no effect on the processor state (e.g., registers, flags) +// +//!char *padding(char * stream, unsigned num); + +// prolog and epilog code generation +//- char *prolog(char * stream,unsigned frame_size,unsigned reg_save_mask); +//- char *epilog(char * stream,unsigned reg_save_mask); + +//!extern R_Opnd reg_operand_array[]; + +// fsave and frstor +//!char *fsave(char * stream); +//!char *frstor(char * stream); + +// lahf : Load Status Flags into AH Register +//!char *lahf(char * stream); + +// mfence : Memory Fence +//!char *mfence(char * stream); + +#endif // _VM_ENCODER_H_ diff --git a/libpixelflinger/codeflinger/x86/libenc/encoder.inl b/libpixelflinger/codeflinger/x86/libenc/encoder.inl new file mode 100644 index 0000000..ec72097 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/libenc/encoder.inl @@ -0,0 +1,863 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @author Alexander V. Astapchuk + */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> + +extern const RegName map_of_regno_2_regname[]; +extern const OpndSize map_of_EncoderOpndSize_2_RealOpndSize[]; +extern const Mnemonic map_of_alu_opcode_2_mnemonic[]; +extern const Mnemonic map_of_shift_opcode_2_mnemonic[]; + +// S_ stands for 'Signed' +extern const Mnemonic S_map_of_condition_code_2_branch_mnemonic[]; +// U_ stands for 'Unsigned' +extern const Mnemonic U_map_of_condition_code_2_branch_mnemonic[]; + +inline static RegName map_reg(Reg_No r) { + assert(r >= 0 && r <= n_reg); + return map_of_regno_2_regname[r]; +} + +inline static OpndSize map_size(Opnd_Size o_size) { + assert(o_size >= 0 && o_size <= n_size); + return map_of_EncoderOpndSize_2_RealOpndSize[o_size]; +} + +inline static Mnemonic map_alu(ALU_Opcode alu) { + assert(alu >= 0 && alu < n_alu); + return map_of_alu_opcode_2_mnemonic[alu]; +} + +inline static Mnemonic map_shift(Shift_Opcode shc) { + assert(shc >= 0 && shc < n_shift); + return map_of_shift_opcode_2_mnemonic[shc]; +} + +inline bool fit8(int64 val) { + return (CHAR_MIN <= val) && (val <= CHAR_MAX); +} + +inline bool fit32(int64 val) { + return (INT_MIN <= val) && (val <= INT_MAX); +} + +inline static void 
add_r(EncoderBase::Operands & args, const R_Opnd & r, Opnd_Size sz, OpndExt ext = OpndExt_None) { + RegName reg = map_reg(r.reg_no()); + if (sz != n_size) { + OpndSize size = map_size(sz); + if (size != getRegSize(reg)) { + reg = getAliasReg(reg, size); + } + } + args.add(EncoderBase::Operand(reg, ext)); +} + +inline static void add_m(EncoderBase::Operands & args, const M_Opnd & m, Opnd_Size sz, OpndExt ext = OpndExt_None) { + assert(n_size != sz); + args.add(EncoderBase::Operand(map_size(sz), + map_reg(m.base().reg_no()), map_reg(m.index().reg_no()), + (unsigned)m.scale().get_value(), (int)m.disp().get_value(), ext)); +} + +inline static void add_rm(EncoderBase::Operands & args, const RM_Opnd & rm, Opnd_Size sz, OpndExt ext = OpndExt_None) { + rm.is_reg() ? add_r(args, (R_Opnd &)rm, sz, ext) : add_m(args, (M_Opnd &)rm, sz, ext); +} + +inline static void add_xmm(EncoderBase::Operands & args, const XMM_Opnd & xmm, bool dbl) { + // Gregory - + // XMM registers indexes in Reg_No enum are shifted by xmm0_reg, their indexes + // don't start with 0, so it is necessary to subtract xmm0_reg index from + // xmm.get_idx() value + assert(xmm.get_idx() >= xmm0_reg); + return args.add((RegName)( (dbl ? RegName_XMM0D : RegName_XMM0S) + xmm.get_idx() - + xmm0_reg)); +} + +inline static void add_fp(EncoderBase::Operands & args, unsigned i, bool dbl) { + return args.add((RegName)( (dbl ? RegName_FP0D : RegName_FP0S) + i)); +} + +inline static void add_imm(EncoderBase::Operands & args, const Imm_Opnd & imm) { + assert(n_size != imm.get_size()); + args.add(EncoderBase::Operand(map_size(imm.get_size()), imm.get_value(), + imm.is_signed() ? 
OpndExt_Signed : OpndExt_Zero)); +} + +ENCODER_DECLARE_EXPORT char * prefix(char * stream, InstrPrefix p) { + *stream = (char)p; + return stream + 1; +} + +// stack push and pop instructions +ENCODER_DECLARE_EXPORT char * push(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_PUSH, args); +} + +ENCODER_DECLARE_EXPORT char * push(char * stream, const Imm_Opnd & imm) { + EncoderBase::Operands args; +#ifdef _EM64T_ + add_imm(args, imm); +#else + // we need this workaround to be compatible with the former ia32 encoder implementation + add_imm(args, Imm_Opnd(size_32, imm.get_value())); +#endif + return EncoderBase::encode(stream, Mnemonic_PUSH, args); +} + +ENCODER_DECLARE_EXPORT char * pop(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_POP, args); +} + +// cmpxchg or xchg +ENCODER_DECLARE_EXPORT char * cmpxchg(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + add_r(args, r, sz); + RegName implicitReg = getAliasReg(RegName_EAX, map_size(sz)); + args.add(implicitReg); + return (char*)EncoderBase::encode(stream, Mnemonic_CMPXCHG, args); +} + +ENCODER_DECLARE_EXPORT char * xchg(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + add_r(args, r, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_XCHG, args); +} + +// inc(rement), dec(rement), not, neg(ate) instructions +ENCODER_DECLARE_EXPORT char * inc(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_INC, args); +} + +ENCODER_DECLARE_EXPORT char * dec(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, 
sz); + return (char*)EncoderBase::encode(stream, Mnemonic_DEC, args); +} + +ENCODER_DECLARE_EXPORT char * _not(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_NOT, args); +} + +ENCODER_DECLARE_EXPORT char * neg(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_NEG, args); +} + +ENCODER_DECLARE_EXPORT char * nop(char * stream) { + EncoderBase::Operands args; + return (char*)EncoderBase::encode(stream, Mnemonic_NOP, args); +} + +ENCODER_DECLARE_EXPORT char * int3(char * stream) { + EncoderBase::Operands args; + return (char*)EncoderBase::encode(stream, Mnemonic_INT3, args); +} + +// alu instructions: add, or, adc, sbb, and, sub, xor, cmp +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, map_alu(opc), args); +}; + +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const M_Opnd & m, const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, m, sz); + add_rm(args, r, sz); + return (char*)EncoderBase::encode(stream, map_alu(opc), args); +} + +ENCODER_DECLARE_EXPORT char * alu(char * stream, ALU_Opcode opc, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, r, sz); + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, map_alu(opc), args); +} + +// test instruction +ENCODER_DECLARE_EXPORT char * test(char * stream, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + assert(imm.get_size() <= sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_TEST, args); +} + +ENCODER_DECLARE_EXPORT 
char * test(char * stream, const RM_Opnd & rm, const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + add_r(args, r, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_TEST, args); +} + +// shift instructions: shl, shr, sar, shld, shrd +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode shc, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, map_shift(shc), args); +} + +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode shc, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + args.add(RegName_CL); + return (char*)EncoderBase::encode(stream, map_shift(shc), args); +} + +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode shc, const RM_Opnd & rm, + const R_Opnd & r, const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + assert(shc == shld_opc || shc == shrd_opc); + add_rm(args, rm, sz); + add_r(args, r, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, map_shift(shc), args); +} + +ENCODER_DECLARE_EXPORT char * shift(char * stream, Shift_Opcode shc, const RM_Opnd & rm, + const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + assert(shc == shld_opc || shc == shrd_opc); + add_rm(args, rm, sz); + add_r(args, r, sz); + args.add(RegName_CL); + return (char*)EncoderBase::encode(stream, map_shift(shc), args); +} + +// multiply instructions: mul, imul +ENCODER_DECLARE_EXPORT char * mul(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + args.add(RegName_EDX); + args.add(RegName_EAX); + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_MUL, args); +} + +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_rm(args, rm, sz); + 
return (char*)EncoderBase::encode(stream, Mnemonic_IMUL, args); +} + +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_IMUL, args); +} + +ENCODER_DECLARE_EXPORT char * imul(char * stream, const R_Opnd & r, const RM_Opnd & rm, + const Imm_Opnd & imm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_rm(args, rm, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_IMUL, args); +} + +// divide instructions: div, idiv +ENCODER_DECLARE_EXPORT char * idiv(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; +#ifdef _EM64T_ + add_r(args, rdx_opnd, sz); + add_r(args, rax_opnd, sz); +#else + add_r(args, edx_opnd, sz); + add_r(args, eax_opnd, sz); +#endif + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_IDIV, args); +} + +ENCODER_DECLARE_EXPORT char * div(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; +#ifdef _EM64T_ + add_r(args, rdx_opnd, sz); + add_r(args, rax_opnd, sz); +#else + add_r(args, edx_opnd, sz); + add_r(args, eax_opnd, sz); +#endif + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_DIV, args); +} + +// data movement: mov +ENCODER_DECLARE_EXPORT char * mov(char * stream, const M_Opnd & m, const R_Opnd & r, Opnd_Size sz) { + EncoderBase::Operands args; + add_m(args, m, sz); + add_r(args, r, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_MOV, args); +} + +ENCODER_DECLARE_EXPORT char * mov(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_MOV, args); +} + +ENCODER_DECLARE_EXPORT char * mov(char * stream, const RM_Opnd & rm, const Imm_Opnd & imm, Opnd_Size sz) { + 
EncoderBase::Operands args; + add_rm(args, rm, sz); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_MOV, args); +} + +ENCODER_DECLARE_EXPORT char * movd(char * stream, const RM_Opnd & rm, const XMM_Opnd & xmm) { + EncoderBase::Operands args; + add_rm(args, rm, size_32); + add_xmm(args, xmm, false); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVD, args); +} + +ENCODER_DECLARE_EXPORT char * movd(char * stream, const XMM_Opnd & xmm, const RM_Opnd & rm) { + EncoderBase::Operands args; + add_xmm(args, xmm, false); + add_rm(args, rm, size_32); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVD, args); +} + +ENCODER_DECLARE_EXPORT char * movq(char * stream, const RM_Opnd & rm, const XMM_Opnd & xmm) { + EncoderBase::Operands args; + add_rm(args, rm, size_64); + add_xmm(args, xmm, true); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVQ, args); +} + +ENCODER_DECLARE_EXPORT char * movq(char * stream, const XMM_Opnd & xmm, const RM_Opnd & rm) { + EncoderBase::Operands args; + add_xmm(args, xmm, true); + add_rm(args, rm, size_64); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVQ, args); +} + +ENCODER_DECLARE_EXPORT char * movsx(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, n_size); + add_rm(args, rm, sz, OpndExt_Signed); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVSX, args); +} + +ENCODER_DECLARE_EXPORT char * movzx(char * stream, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, n_size); + // movzx r64, r/m32 is not available on em64t + // mov r32, r/m32 should zero out upper bytes + assert(sz <= size_16); + add_rm(args, rm, sz, OpndExt_Zero); + return (char*)EncoderBase::encode(stream, Mnemonic_MOVZX, args); +} + +// sse mov +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, 
dbl); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_MOVSD : Mnemonic_MOVSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const M_Opnd & mem, const XMM_Opnd & xmm, bool dbl) { + EncoderBase::Operands args; + add_m(args, mem, dbl ? size_64 : size_32); + add_xmm(args, xmm, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_MOVSD : Mnemonic_MOVSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_mov(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_MOVSD : Mnemonic_MOVSS, args ); +} + +// sse add, sub, mul, div +ENCODER_DECLARE_EXPORT char * sse_add(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, dbl); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_ADDSD : Mnemonic_ADDSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_add(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_ADDSD : Mnemonic_ADDSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_sub(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, dbl); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_SUBSD : Mnemonic_SUBSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_sub(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? 
Mnemonic_SUBSD : Mnemonic_SUBSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_mul( char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, dbl); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_MULSD : Mnemonic_MULSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_mul(char * stream, const XMM_Opnd& xmm0, const XMM_Opnd& xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_MULSD : Mnemonic_MULSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_div(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, dbl); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_DIVSD : Mnemonic_DIVSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_div(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_DIVSD : Mnemonic_DIVSS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_xor(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1) { + EncoderBase::Operands args; + add_xmm(args, xmm0, true); + add_xmm(args, xmm1, true); + return (char*)EncoderBase::encode(stream, Mnemonic_PXOR, args); +} + +ENCODER_DECLARE_EXPORT char * sse_compare(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, true); + add_xmm(args, xmm1, true); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_COMISD : Mnemonic_COMISS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_compare(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_m(args, mem, dbl ? 
size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_COMISD : Mnemonic_COMISS, args); +} + +// sse conversions +ENCODER_DECLARE_EXPORT char * sse_cvt_si(char * stream, const XMM_Opnd & xmm, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm, dbl); + add_m(args, mem, size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_CVTSI2SD : Mnemonic_CVTSI2SS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_cvtt2si(char * stream, const R_Opnd & reg, const M_Opnd & mem, bool dbl) { + EncoderBase::Operands args; + add_rm(args, reg, size_32); + add_m(args, mem, dbl ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_CVTTSD2SI : Mnemonic_CVTTSS2SI, args); +} + +ENCODER_DECLARE_EXPORT char * sse_cvtt2si(char * stream, const R_Opnd & reg, const XMM_Opnd & xmm, bool dbl) { + EncoderBase::Operands args; + add_rm(args, reg, size_32); + add_xmm(args, xmm, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_CVTTSD2SI : Mnemonic_CVTTSS2SI, args); +} + +ENCODER_DECLARE_EXPORT char * sse_cvt_fp2dq(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? Mnemonic_CVTTPD2DQ : Mnemonic_CVTTPS2DQ, args); +} + +ENCODER_DECLARE_EXPORT char * sse_cvt_dq2fp(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1, bool dbl) { + EncoderBase::Operands args; + add_xmm(args, xmm0, dbl); + add_xmm(args, xmm1, dbl); + return (char*)EncoderBase::encode(stream, dbl ? 
Mnemonic_CVTDQ2PD : Mnemonic_CVTDQ2PS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_d2s(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem64) { + EncoderBase::Operands args; + add_xmm(args, xmm0, false); + add_m(args, mem64, size_64); + return (char*)EncoderBase::encode(stream, Mnemonic_CVTSD2SS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_d2s(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1) { + EncoderBase::Operands args; + add_xmm(args, xmm0, false); + add_xmm(args, xmm1, true); + return (char*)EncoderBase::encode(stream, Mnemonic_CVTSD2SS, args); +} + +ENCODER_DECLARE_EXPORT char * sse_s2d(char * stream, const XMM_Opnd & xmm0, const M_Opnd & mem32) { + EncoderBase::Operands args; + add_xmm(args, xmm0, true); + add_m(args, mem32, size_32); + return (char*)EncoderBase::encode(stream, Mnemonic_CVTSS2SD, args); +} + +ENCODER_DECLARE_EXPORT char * sse_s2d(char * stream, const XMM_Opnd & xmm0, const XMM_Opnd & xmm1) { + EncoderBase::Operands args; + add_xmm(args, xmm0, true); + add_xmm(args, xmm1, false); + return (char*)EncoderBase::encode(stream, Mnemonic_CVTSS2SD, args); +} + +// condition operations +ENCODER_DECLARE_EXPORT char *cmov(char * stream, ConditionCode cc, const R_Opnd & r, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, (Mnemonic)(Mnemonic_CMOVcc + cc), args); +} + +ENCODER_DECLARE_EXPORT char * setcc(char * stream, ConditionCode cc, const RM_Opnd & rm8) { + EncoderBase::Operands args; + add_rm(args, rm8, size_8); + return (char*)EncoderBase::encode(stream, (Mnemonic)(Mnemonic_SETcc + cc), args); +} + +// load effective address: lea +ENCODER_DECLARE_EXPORT char * lea(char * stream, const R_Opnd & r, const M_Opnd & m, Opnd_Size sz) { + EncoderBase::Operands args; + add_r(args, r, sz); + add_m(args, m, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_LEA, args); +} + +ENCODER_DECLARE_EXPORT char * cdq(char * stream) { 
+ EncoderBase::Operands args; + args.add(RegName_EDX); + args.add(RegName_EAX); + return (char*)EncoderBase::encode(stream, Mnemonic_CDQ, args); +} + +ENCODER_DECLARE_EXPORT char * wait(char * stream) { + return (char*)EncoderBase::encode(stream, Mnemonic_WAIT, EncoderBase::Operands()); +} + +// control-flow instructions + +// loop +ENCODER_DECLARE_EXPORT char * loop(char * stream, const Imm_Opnd & imm) { + EncoderBase::Operands args; + assert(imm.get_size() == size_8); + args.add(RegName_ECX); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_LOOP, args); +} + +// jump +ENCODER_DECLARE_EXPORT char * jump8(char * stream, const Imm_Opnd & imm) { + EncoderBase::Operands args; + assert(imm.get_size() == size_8); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_JMP, args); +} + +ENCODER_DECLARE_EXPORT char * jump32(char * stream, const Imm_Opnd & imm) { + EncoderBase::Operands args; + assert(imm.get_size() == size_32); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_JMP, args); +} + +ENCODER_DECLARE_EXPORT char * jump(char * stream, const RM_Opnd & rm, Opnd_Size sz) { + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_JMP, args); +} + +/** + * @note On EM64T: if target lies beyond 2G (does not fit into 32 bit + * offset) then generates indirect jump using RAX (whose content is + * destroyed). 
+ */ +ENCODER_DECLARE_EXPORT char * jump(char * stream, char * target) { +#ifdef _EM64T_ + int64 offset = target - stream; + // sub 2 bytes for the short version + offset -= 2; + if (fit8(offset)) { + // use 8-bit signed relative form + return jump8(stream, Imm_Opnd(size_8, offset)); + } else if (fit32(offset)) { + // sub 5 (3 + 2)bytes for the long version + offset -= 3; + // use 32-bit signed relative form + return jump32(stream, Imm_Opnd(size_32, offset)); + } + // need to use absolute indirect jump + stream = mov(stream, rax_opnd, Imm_Opnd(size_64, (int64)target), size_64); + return jump(stream, rax_opnd, size_64); +#else + I_32 offset = target - stream; + // sub 2 bytes for the short version + offset -= 2; + if (fit8(offset)) { + // use 8-bit signed relative form + return jump8(stream, Imm_Opnd(size_8, offset)); + } + // sub 5 (3 + 2) bytes for the long version + offset -= 3; + // use 32-bit signed relative form + return jump32(stream, Imm_Opnd(size_32, offset)); +#endif +} + +// branch +ENCODER_DECLARE_EXPORT char * branch8(char * stream, ConditionCode cond, + const Imm_Opnd & imm, + InstrPrefix pref) +{ + if (pref != no_prefix) { + assert(pref == hint_branch_taken_prefix || pref == hint_branch_taken_prefix); + stream = prefix(stream, pref); + } + Mnemonic m = (Mnemonic)(Mnemonic_Jcc + cond); + EncoderBase::Operands args; + assert(imm.get_size() == size_8); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, m, args); +} + +ENCODER_DECLARE_EXPORT char * branch32(char * stream, ConditionCode cond, + const Imm_Opnd & imm, + InstrPrefix pref) +{ + if (pref != no_prefix) { + assert(pref == hint_branch_taken_prefix || pref == hint_branch_taken_prefix); + stream = prefix(stream, pref); + } + Mnemonic m = (Mnemonic)(Mnemonic_Jcc + cond); + EncoderBase::Operands args; + assert(imm.get_size() == size_32); + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, m, args); +} + +/* +ENCODER_DECLARE_EXPORT char * branch(char * stream, ConditionCode 
cc, const char * target, InstrPrefix prefix) { +// sub 2 bytes for the short version +int64 offset = stream-target-2; +if( fit8(offset) ) { +return branch8(stream, cc, Imm_Opnd(size_8, (char)offset), is_signed); +} +return branch32(stream, cc, Imm_Opnd(size_32, (int)offset), is_signed); +} +*/ + +// call +ENCODER_DECLARE_EXPORT char * call(char * stream, const Imm_Opnd & imm) +{ + EncoderBase::Operands args; + add_imm(args, imm); + return (char*)EncoderBase::encode(stream, Mnemonic_CALL, args); +} + +ENCODER_DECLARE_EXPORT char * call(char * stream, const RM_Opnd & rm, + Opnd_Size sz) +{ + EncoderBase::Operands args; + add_rm(args, rm, sz); + return (char*)EncoderBase::encode(stream, Mnemonic_CALL, args); +} + +/** +* @note On EM64T: if target lies beyond 2G (does not fit into 32 bit +* offset) then generates indirect jump using RAX (whose content is +* destroyed). +*/ +ENCODER_DECLARE_EXPORT char * call(char * stream, const char * target) +{ +#ifdef _EM64T_ + int64 offset = target - stream; + if (fit32(offset)) { + offset -= 5; // sub 5 bytes for this instruction + Imm_Opnd imm(size_32, offset); + return call(stream, imm); + } + // need to use absolute indirect call + stream = mov(stream, rax_opnd, Imm_Opnd(size_64, (int64)target), size_64); + return call(stream, rax_opnd, size_64); +#else + I_32 offset = target - stream; + offset -= 5; // sub 5 bytes for this instruction + Imm_Opnd imm(size_32, offset); + return call(stream, imm); +#endif +} + +// return instruction +ENCODER_DECLARE_EXPORT char * ret(char * stream) +{ + EncoderBase::Operands args; + return (char*)EncoderBase::encode(stream, Mnemonic_RET, args); +} + +ENCODER_DECLARE_EXPORT char * ret(char * stream, const Imm_Opnd & imm) +{ + EncoderBase::Operands args; + // TheManual says imm can be 16-bit only + //assert(imm.get_size() <= size_16); + args.add(EncoderBase::Operand(map_size(size_16), imm.get_value())); + return (char*)EncoderBase::encode(stream, Mnemonic_RET, args); +} + +ENCODER_DECLARE_EXPORT 
char * ret(char * stream, unsigned short pop) +{ + // TheManual says it can only be imm16 + EncoderBase::Operands args(EncoderBase::Operand(OpndSize_16, pop, OpndExt_Zero)); + return (char*)EncoderBase::encode(stream, Mnemonic_RET, args); +} + +// floating-point instructions +ENCODER_DECLARE_EXPORT char * fld(char * stream, const M_Opnd & m, + bool is_double) { + EncoderBase::Operands args; + // a fake FP register as operand + add_fp(args, 0, is_double); + add_m(args, m, is_double ? size_64 : size_32); + return (char*)EncoderBase::encode(stream, Mnemonic_FLD, args); +} + +ENCODER_DECLARE_EXPORT char * fist(char * stream, const M_Opnd & mem, + bool is_long, bool pop_stk) +{ + EncoderBase::Operands args; + if (pop_stk) { + add_m(args, mem, is_long ? size_64 : size_32); + // a fake FP register as operand + add_fp(args, 0, is_long); + return (char*)EncoderBase::encode(stream, Mnemonic_FISTP, args); + } + // only 32-bit operands are supported + assert(is_long == false); + add_m(args, mem, size_32); + add_fp(args, 0, false); + return (char*)EncoderBase::encode(stream, Mnemonic_FIST, args); +} + +ENCODER_DECLARE_EXPORT char * fst(char * stream, const M_Opnd & m, + bool is_double, bool pop_stk) +{ + EncoderBase::Operands args; + add_m(args, m, is_double ? size_64 : size_32); + // a fake FP register as operand + add_fp(args, 0, is_double); + return (char*)EncoderBase::encode(stream, + pop_stk ? Mnemonic_FSTP : Mnemonic_FST, + args); +} + +ENCODER_DECLARE_EXPORT char * fst(char * stream, unsigned i, bool pop_stk) +{ + EncoderBase::Operands args; + add_fp(args, i, true); + return (char*)EncoderBase::encode(stream, + pop_stk ? 
Mnemonic_FSTP : Mnemonic_FST, + args); +} + +ENCODER_DECLARE_EXPORT char * fldcw(char * stream, const M_Opnd & mem) { + EncoderBase::Operands args; + add_m(args, mem, size_16); + return (char*)EncoderBase::encode(stream, Mnemonic_FLDCW, args); +} + +ENCODER_DECLARE_EXPORT char * fnstcw(char * stream, const M_Opnd & mem) { + EncoderBase::Operands args; + add_m(args, mem, size_16); + return (char*)EncoderBase::encode(stream, Mnemonic_FNSTCW, args); +} + +ENCODER_DECLARE_EXPORT char * fnstsw(char * stream) +{ + return (char*)EncoderBase::encode(stream, Mnemonic_FNSTCW, + EncoderBase::Operands()); +} + +// string operations +ENCODER_DECLARE_EXPORT char * set_d(char * stream, bool set) { + EncoderBase::Operands args; + return (char*)EncoderBase::encode(stream, + set ? Mnemonic_STD : Mnemonic_CLD, + args); +} + +ENCODER_DECLARE_EXPORT char * scas(char * stream, unsigned char prefix) +{ + EncoderBase::Operands args; + if (prefix != no_prefix) { + assert(prefix == prefix_repnz || prefix == prefix_repz); + *stream = prefix; + ++stream; + } + return (char*)EncoderBase::encode(stream, Mnemonic_SCAS, args); +} + +ENCODER_DECLARE_EXPORT char * stos(char * stream, unsigned char prefix) +{ + if (prefix != no_prefix) { + assert(prefix == prefix_rep); + *stream = prefix; + ++stream; + } + + EncoderBase::Operands args; + return (char*)EncoderBase::encode(stream, Mnemonic_STOS, args); +} + +// Intrinsic FP math functions + +ENCODER_DECLARE_EXPORT char * fprem(char * stream) { + return (char*)EncoderBase::encode(stream, Mnemonic_FPREM, + EncoderBase::Operands()); +} + +ENCODER_DECLARE_EXPORT char * fprem1(char * stream) { + return (char*)EncoderBase::encode(stream, Mnemonic_FPREM1, + EncoderBase::Operands()); +} diff --git a/libpixelflinger/codeflinger/x86/load_store.cpp b/libpixelflinger/codeflinger/x86/load_store.cpp new file mode 100644 index 0000000..a427411 --- /dev/null +++ b/libpixelflinger/codeflinger/x86/load_store.cpp @@ -0,0 +1,458 @@ +/* 
libs/pixelflinger/codeflinger/x86/load_store.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#include <assert.h> +#include <stdio.h> +#include <cutils/log.h> + +#include "codeflinger/x86/GGLX86Assembler.h" + +namespace android { + +// ---------------------------------------------------------------------------- + +void GGLX86Assembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags) +{ + const int bits = addr.size; + const int inc = (flags & WRITE_BACK)?1:0; + switch (bits) { + case 32: + if (inc) { + MOV_REG_TO_MEM(s.reg, 0, addr.reg); + ADD_IMM_TO_REG(4, addr.reg); + } else { + MOV_REG_TO_MEM(s.reg, 0, addr.reg); + } + break; + case 24: + // 24 bits formats are a little special and used only for RGB + // 0x00BBGGRR is unpacked as R,G,B + MOV_REG_TO_MEM(s.reg, 0, addr.reg, OpndSize_8); + ROR(8, s.reg); + MOV_REG_TO_MEM(s.reg, 1, addr.reg, OpndSize_8); + ROR(8, s.reg); + MOV_REG_TO_MEM(s.reg, 2, addr.reg, OpndSize_8); + if (!(s.flags & CORRUPTIBLE)) { + ROR(16, s.reg); + } + if (inc) { + ADD_IMM_TO_REG(3, addr.reg); + } + break; + case 16: + if (inc) { + MOV_REG_TO_MEM(s.reg, 0, addr.reg,OpndSize_16); + ADD_IMM_TO_REG(2, addr.reg); + } else { + MOV_REG_TO_MEM(s.reg, 0, addr.reg,OpndSize_16); + } + break; + case 8: + if (inc) { + MOV_REG_TO_MEM(s.reg, 0, addr.reg,OpndSize_8); + ADD_IMM_TO_REG(1, addr.reg); + } else { + MOV_REG_TO_MEM(s.reg, 0, addr.reg,OpndSize_8); + } + break; 
+ } +} + +void GGLX86Assembler::load(pointer_t& addr, const pixel_t& s, uint32_t flags) +{ + Scratch scratches(registerFile()); + int s0; + + const int bits = addr.size; + // WRITE_BACK indicates that the base register will also be updated after loading the data + const int inc = (flags & WRITE_BACK)?1:0; + switch (bits) { + case 32: + if (inc) { + MOV_MEM_TO_REG(0, addr.reg, s.reg); + ADD_IMM_TO_REG(4, addr.reg); + + } else MOV_MEM_TO_REG(0, addr.reg, s.reg); + break; + case 24: + // 24 bits formats are a little special and used only for RGB + // R,G,B is packed as 0x00BBGGRR + s0 = scratches.obtain(); + if (s.reg != addr.reg) { + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 0, s.reg); //R + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 1, s0); //G + SHL(8, s0); + OR_REG_TO_REG(s0, s.reg); + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 2, s0); //B + SHL(16, s0); + OR_REG_TO_REG(s0, s.reg); + } else { + int s1 = scratches.obtain(); + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 0, s1); //R + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 1, s0); //G + SHL(8, s0); + OR_REG_TO_REG(s0, s1); + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 2, s0); //B + SHL(16, s0); + OR_REG_TO_REG(s0, s1); + MOV_REG_TO_REG(s1, s.reg); + scratches.recycle(s1); + + } + scratches.recycle(s0); + if (inc) + ADD_IMM_TO_REG(3, addr.reg); + break; + case 16: + if (inc) { + MOVZX_MEM_TO_REG(OpndSize_16, addr.reg, 0, s.reg); + ADD_IMM_TO_REG(2, addr.reg); + } + else MOVZX_MEM_TO_REG(OpndSize_16, addr.reg, 0, s.reg); + break; + case 8: + if (inc) { + MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 0, s.reg); + ADD_IMM_TO_REG(1, addr.reg); + } + else MOVZX_MEM_TO_REG(OpndSize_8, addr.reg, 0, s.reg); + break; + } + if (inc) MOV_REG_TO_MEM(addr.reg, addr.offset_ebp, PhysicalReg_EBP); +} + +void GGLX86Assembler::extract(integer_t& d, int s, int h, int l, int bits) +{ + const int maskLen = h-l; + + assert(maskLen<=8); + assert(h); + + + if (h != bits) { + const int mask = ((1<<maskLen)-1) << l; + MOV_REG_TO_REG(s, d.reg); + AND_IMM_TO_REG(mask, 
d.reg);// component = packed & mask; + s = d.reg; + } + + if (l) { + MOV_REG_TO_REG(s, d.reg); + SHR(l, d.reg);// component = packed >> l; + s = d.reg; + } + + if (s != d.reg) { + MOV_REG_TO_REG(s, d.reg); + } + + d.s = maskLen; +} + +void GGLX86Assembler::extract(integer_t& d, const pixel_t& s, int component) +{ + extract(d, s.reg, + s.format.c[component].h, + s.format.c[component].l, + s.size()); +} + +void GGLX86Assembler::extract(component_t& d, const pixel_t& s, int component) +{ + integer_t r(d.reg, 32, d.flags, d.offset_ebp); + extract(r, s.reg, + s.format.c[component].h, + s.format.c[component].l, + s.size()); + d = component_t(r); +} + + +void GGLX86Assembler::expand(integer_t& d, const component_t& s, int dbits) +{ + if (s.l || (s.flags & CLEAR_HI)) { + extract(d, s.reg, s.h, s.l, 32); + expand(d, d, dbits); + } else { + expand(d, integer_t(s.reg, s.size(), s.flags, s.offset_ebp), dbits); + } +} + +void GGLX86Assembler::expand(component_t& d, const component_t& s, int dbits) +{ + integer_t r(d.reg, 32, d.flags, d.offset_ebp); + expand(r, s, dbits); + d = component_t(r); +} + +void GGLX86Assembler::expand(integer_t& dst, const integer_t& src, int dbits) +{ + assert(src.size()); + + Scratch scratches(registerFile()); + int sbits = src.size(); + int s = src.reg; + int d = dst.reg; + + // be sure to set 'dst' after we read 'src' as they may be identical + dst.s = dbits; + dst.flags = 0; + + if (dbits<=sbits) { + if (s != d) { + MOV_REG_TO_REG(s, d); + } + return; + } + + if (sbits == 1) { + MOV_REG_TO_REG(s, d); + SHL(dbits, d); + SUB_REG_TO_REG(s, d); + // d = (s<<dbits) - s; + return; + } + + if (dbits % sbits) { + MOV_REG_TO_REG(s, d); + SHL(dbits-sbits, d); + // d = s << (dbits-sbits); + dbits -= sbits; + int temp = scratches.obtain(); + do { + MOV_REG_TO_REG(d, temp); + SHR(sbits, temp); + OR_REG_TO_REG(temp, d); + // d |= d >> sbits; + dbits -= sbits; + sbits *= 2; + } while(dbits>0); + return; + } + + dbits -= sbits; + do { + MOV_REG_TO_REG(s, d); + 
SHL(sbits, d); + OR_REG_TO_REG(s, d); + // d |= d<<sbits; + s = d; + dbits -= sbits; + if (sbits*2 < dbits) { + sbits *= 2; + } + } while(dbits>0); +} + +void GGLX86Assembler::downshift( + pixel_t& d, int component, component_t s, reg_t& dither) +{ + const needs_t& needs = mBuilderContext.needs; + Scratch scratches(registerFile()); + // s(temp) is loaded in build_blending + s.reg = scratches.obtain(); + MOV_MEM_TO_REG(s.offset_ebp, EBP, s.reg); + + int sh = s.h; + int sl = s.l; + int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0; + int maskLoBits = (sl!=0) ? ((s.flags & CLEAR_LO)?1:0) : 0; + int sbits = sh - sl; + + int dh = d.format.c[component].h; + int dl = d.format.c[component].l; + int dbits = dh - dl; + int dithering = 0; + + ALOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits); + + if (sbits>dbits) { + // see if we need to dither + dithering = mDithering; + } + + int ireg = d.reg; + if (!(d.flags & FIRST)) { + if (s.flags & CORRUPTIBLE) { + ireg = s.reg; + } else { + ireg = scratches.obtain(); + } + } + d.flags &= ~FIRST; + + if (maskHiBits) { + // we need to mask the high bits (and possibly the lowbits too) + // and we might be able to use immediate mask. 
+ if (!dithering) { + // we don't do this if we only have maskLoBits because we can + // do it more efficiently below (in the case where dl=0) + const int offset = sh - dbits; + if (dbits<=8 && offset >= 0) { + const uint32_t mask = ((1<<dbits)-1) << offset; + build_and_immediate(ireg, s.reg, mask, 32); + s.reg = ireg; + sl = offset; + sbits = dbits; + maskLoBits = maskHiBits = 0; + } + } else { + // in the dithering case though, we need to preserve the lower bits + const uint32_t mask = ((1<<sbits)-1) << sl; + build_and_immediate(ireg, s.reg, mask, 32); + s.reg = ireg; + maskLoBits = maskHiBits = 0; + } + } + + // XXX: we could special case (maskHiBits & !maskLoBits) + // like we do for maskLoBits below, but it happens very rarely + // that we have maskHiBits only and the conditions necessary to lead + // to better code (like doing d |= s << 24) + + if (maskHiBits) { + MOV_REG_TO_REG(s.reg, ireg); + SHL(32-sh, ireg); + sl += 32-sh; + sh = 32; + s.reg = ireg; + maskHiBits = 0; + } + + // Downsampling should be performed as follows: + // V * ((1<<dbits)-1) / ((1<<sbits)-1) + // V * [(1<<dbits)/((1<<sbits)-1) - 1/((1<<sbits)-1)] + // V * [1/((1<<sbits)-1)>>dbits - 1/((1<<sbits)-1)] + // V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/((1<<sbits)-1)>>sbits + // V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/(1-(1>>sbits)) + // + // By approximating (1>>dbits) and (1>>sbits) to 0: + // + // V>>(sbits-dbits) - V>>sbits + // + // A good approximation is V>>(sbits-dbits), + // but better one (needed for dithering) is: + // + // (V>>(sbits-dbits)<<sbits - V)>>sbits + // (V<<dbits - V)>>sbits + // (V - V>>dbits)>>(sbits-dbits) + + // Dithering is done here + if (dithering) { + comment("dithering"); + if (sl) { + MOV_REG_TO_REG(s.reg, ireg); + SHR(sl, ireg); + sh -= sl; + sl = 0; + s.reg = ireg; + } + // scaling (V-V>>dbits) + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(s.reg, temp_reg); + SHR(dbits, temp_reg); + MOV_REG_TO_REG(s.reg, ireg); + SUB_REG_TO_REG(temp_reg, 
ireg); + scratches.recycle(temp_reg); + const int shift = (GGL_DITHER_BITS - (sbits-dbits)); + dither.reg = scratches.obtain(); + MOV_MEM_TO_REG(dither.offset_ebp, EBP, dither.reg); + if (shift>0) { + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(dither.reg, temp_reg); + SHR(shift, temp_reg); + ADD_REG_TO_REG(temp_reg, ireg); + scratches.recycle(temp_reg); + } + else if (shift<0) { + temp_reg = scratches.obtain(); + MOV_REG_TO_REG(dither.reg, temp_reg); + SHL(-shift, temp_reg); + ADD_REG_TO_REG(temp_reg, ireg); + scratches.recycle(temp_reg); + } + else { + ADD_REG_TO_REG(dither.reg, ireg); + } + scratches.recycle(dither.reg); + s.reg = ireg; + } + + if ((maskLoBits|dithering) && (sh > dbits)) { + int shift = sh-dbits; + if (dl) { + MOV_REG_TO_REG(s.reg, ireg); + SHR(shift, ireg); + if (ireg == d.reg) { + MOV_REG_TO_REG(ireg, d.reg); + SHL(dl, d.reg); + } else { + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(ireg, temp_reg); + SHL(dl, temp_reg); + OR_REG_TO_REG(temp_reg, d.reg); + scratches.recycle(temp_reg); + } + } else { + if (ireg == d.reg) { + MOV_REG_TO_REG(s.reg, d.reg); + SHR(shift, d.reg); + } else { + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(s.reg, temp_reg); + SHR(shift, temp_reg); + OR_REG_TO_REG(temp_reg, d.reg); + scratches.recycle(temp_reg); + } + } + } else { + int shift = sh-dh; + if (shift>0) { + if (ireg == d.reg) { + MOV_REG_TO_REG(s.reg, d.reg); + SHR(shift, d.reg); + } else { + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(s.reg, temp_reg); + SHR(shift, temp_reg); + OR_REG_TO_REG(temp_reg, d.reg); + scratches.recycle(temp_reg); + } + } else if (shift<0) { + if (ireg == d.reg) { + MOV_REG_TO_REG(s.reg, d.reg); + SHL(-shift, d.reg); + } else { + int temp_reg = scratches.obtain(); + MOV_REG_TO_REG(s.reg, temp_reg); + SHL(-shift, temp_reg); + OR_REG_TO_REG(temp_reg, d.reg); + scratches.recycle(temp_reg); + } + } else { + if (ireg == d.reg) { + if (s.reg != d.reg) { + MOV_REG_TO_REG(s.reg, d.reg); + } + } else { + 
OR_REG_TO_REG(s.reg, d.reg); + } + } + } +} + +}; // namespace android diff --git a/libpixelflinger/codeflinger/x86/texturing.cpp b/libpixelflinger/codeflinger/x86/texturing.cpp new file mode 100644 index 0000000..c02f12b --- /dev/null +++ b/libpixelflinger/codeflinger/x86/texturing.cpp @@ -0,0 +1,1799 @@ +/* libs/pixelflinger/codeflinger/x86/texturing.cpp +** +** Copyright 2006, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#include <assert.h> +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> + +#include <cutils/log.h> + +#include "codeflinger/x86/GGLX86Assembler.h" + + +namespace android { + +// --------------------------------------------------------------------------- + +// iterators are initialized like this: +// (intToFixedCenter(x) * dx)>>16 + x0 +// ((x<<16 + 0x8000) * dx)>>16 + x0 +// ((x<<16)*dx + (0x8000*dx))>>16 + x0 +// ( (x*dx) + dx>>1 ) + x0 +// (x*dx) + (dx>>1 + x0) + +void GGLX86Assembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x) +{ + context_t const* c = mBuilderContext.c; + const needs_t& needs = mBuilderContext.needs; + int temp_reg; + + if (mSmooth) { + // NOTE: we could take this case in the mDithering + !mSmooth case, + // but this would use up to 4 more registers for the color components + // for only a little added quality. 
+ // Currently, this causes the system to run out of registers in + // some case (see issue #719496) + + comment("compute initial iterated color (smooth and/or dither case)"); + + parts.iterated_packed = 0; + parts.packed = 0; + + // 0x1: color component + // 0x2: iterators + //parts.reload = 3; + const int optReload = mOptLevel >> 1; + if (optReload >= 3) parts.reload = 0; // reload nothing + else if (optReload == 2) parts.reload = 2; // reload iterators + else if (optReload == 1) parts.reload = 1; // reload colors + else if (optReload <= 0) parts.reload = 3; // reload both + + if (!mSmooth) { + // we're not smoothing (just dithering), we never have to + // reload the iterators + parts.reload &= ~2; + } + + Scratch scratches(registerFile()); + const int t0 = (parts.reload & 1) ? scratches.obtain() : 0; + const int t1 = (parts.reload & 2) ? scratches.obtain() : 0; + for (int i=0 ; i<4 ; i++) { + if (!mInfo[i].iterated) + continue; + // this component exists in the destination and is not replaced + // by a texture unit. + const int c = (parts.reload & 1) ? t0 : obtainReg(); + if (i==0) CONTEXT_LOAD(c, iterators.ydady); + if (i==1) CONTEXT_LOAD(c, iterators.ydrdy); + if (i==2) CONTEXT_LOAD(c, iterators.ydgdy); + if (i==3) CONTEXT_LOAD(c, iterators.ydbdy); + parts.argb[i].reg = c; + + if (mInfo[i].smooth) { + parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg(); + const int dvdx = parts.argb_dx[i].reg; + temp_reg = scratches.obtain(); + CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx); + MOV_REG_TO_REG(dvdx, temp_reg); + IMUL(x.reg, temp_reg); + ADD_REG_TO_REG(temp_reg, c); + scratches.recycle(temp_reg); + + // adjust the color iterator to make sure it won't overflow + if (!mAA) { + // this is not needed when we're using anti-aliasing + // because we will (have to) clamp the components + // anyway. 
+ int end = scratches.obtain(); + MOV_MEM_TO_REG(parts.count.offset_ebp, PhysicalReg_EBP, end); + SHR(16, end); + IMUL(end, dvdx); + temp_reg = end; + // c - (dvdx*end + c) = -(dvdx*end) + MOV_REG_TO_REG(dvdx, temp_reg); + NEG(temp_reg); + ADD_REG_TO_REG(c, dvdx); + CMOV_REG_TO_REG(Mnemonic_CMOVS, temp_reg, c); + /* + SUB_REG_TO_REG(dvdx, temp_reg); + switch(i) { + case 0: + JCC(Mnemonic_JNS, "1f_init_iterated_color"); + SUB_REG_TO_REG(dvdx, c); + label("1f_init_iterated_color"); + break; + case 1: + JCC(Mnemonic_JNS, "2f_init_iterated_color"); + SUB_REG_TO_REG(dvdx, c); + label("2f_init_iterated_color"); + break; + case 2: + JCC(Mnemonic_JNS, "3f_init_iterated_color"); + SUB_REG_TO_REG(dvdx, c); + label("3f_init_iterated_color"); + break; + case 3: + JCC(Mnemonic_JNS, "4f_init_iterated_color"); + SUB_REG_TO_REG(dvdx, c); + label("4f_init_iterated_color"); + break; + } + */ + + MOV_REG_TO_REG(c, temp_reg); + SAR(31, temp_reg); + NOT(temp_reg); + AND_REG_TO_REG(temp_reg, c); + scratches.recycle(end); + } + if(parts.reload & 2) + scratches.recycle(dvdx); + else + recycleReg(dvdx); + } + CONTEXT_STORE(c, generated_vars.argb[i].c); + if(parts.reload & 1) + scratches.recycle(parts.argb[i].reg); + else + recycleReg(parts.argb[i].reg); + + parts.argb[i].reg = -1; + //if (parts.reload & 1) { + // //MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + //} + } + } else { + // We're not smoothed, so we can + // just use a packed version of the color and extract the + // components as needed (or not at all if we don't blend) + + // figure out if we need the iterated color + int load = 0; + for (int i=0 ; i<4 ; i++) { + component_info_t& info = mInfo[i]; + if ((info.inDest || info.needed) && !info.replaced) + load |= 1; + } + + parts.iterated_packed = 1; + parts.packed = (!mTextureMachine.mask && !mBlending + && !mFog && !mDithering); + parts.reload = 0; + if (load || parts.packed) { + if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) { + comment("load 
initial iterated color (8888 packed)"); + parts.iterated.setTo(obtainReg(), + &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888])); + CONTEXT_LOAD(parts.iterated.reg, packed8888); + } else { + comment("load initial iterated color (dest format packed)"); + + parts.iterated.setTo(obtainReg(), &mCbFormat); + + // pre-mask the iterated color + const int bits = parts.iterated.size(); + const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1; + uint32_t mask = 0; + if (mMasking) { + for (int i=0 ; i<4 ; i++) { + const int component_mask = 1<<i; + const int h = parts.iterated.format.c[i].h; + const int l = parts.iterated.format.c[i].l; + if (h && (!(mMasking & component_mask))) { + mask |= ((1<<(h-l))-1) << l; + } + } + } + + if (mMasking && ((mask & size)==0)) { + // none of the components are present in the mask + } else { + CONTEXT_LOAD(parts.iterated.reg, packed); + if (mCbFormat.size == 1) { + int imm = 0xFF; + AND_IMM_TO_REG(imm, parts.iterated.reg); + } else if (mCbFormat.size == 2) { + SHR(16, parts.iterated.reg); + } + } + + // pre-mask the iterated color + if (mMasking) { + //AND_IMM_TO_REG(mask, parts.iterated.reg); + build_and_immediate(parts.iterated.reg, parts.iterated.reg, + mask, bits); + } + } + mCurSp = mCurSp - 4; + parts.iterated.offset_ebp = mCurSp; + MOV_REG_TO_MEM(parts.iterated.reg, parts.iterated.offset_ebp, EBP); + //PUSH(parts.iterated.reg); + recycleReg(parts.iterated.reg); + parts.iterated.reg=-1; + } + } +} + +void GGLX86Assembler::build_iterated_color( + component_t& fragment, + fragment_parts_t& parts, + int component, + Scratch& regs) +{ + + if (!mInfo[component].iterated) + return; + + if (parts.iterated_packed) { + // iterated colors are packed, extract the one we need + parts.iterated.reg = regs.obtain(); + MOV_MEM_TO_REG(parts.iterated.offset_ebp, EBP, parts.iterated.reg); + extract(fragment, parts.iterated, component); + regs.recycle(parts.iterated.reg); + } else { + fragment.h = GGL_COLOR_BITS; + fragment.l = GGL_COLOR_BITS - 8; + 
fragment.flags |= CLEAR_LO; + // iterated colors are held in their own register, + // (smooth and/or dithering case) + Scratch scratches(registerFile()); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + if (parts.reload==3) { + // this implies mSmooth + int dx = scratches.obtain(); + CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c); + CONTEXT_LOAD(dx, generated_vars.argb[component].dx); + ADD_REG_TO_REG(fragment.reg, dx); + CONTEXT_STORE(dx, generated_vars.argb[component].c); + scratches.recycle(dx); + } else if (parts.reload & 1) { + //MOV_MEM_TO_REG(parts.argb[component].offset_ebp, EBP, fragment.reg); + CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c); + } else { + // we don't reload, so simply rename the register and mark as + // non CORRUPTIBLE so that the texture env or blending code + // won't modify this (renamed) register + //regs.recycle(fragment.reg); + //MOV_MEM_TO_REG(parts.argb[component].offset_ebp, EBP, fragment.reg); + // it will also be used in build_smooth_shade + CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c); + //fragment.reg = parts.argb[component].reg; + //fragment.flags &= ~CORRUPTIBLE; + } + scratches.recycle(mBuilderContext.Rctx); + if (mInfo[component].smooth && mAA) { + // when using smooth shading AND anti-aliasing, we need to clamp + // the iterators because there is always an extra pixel on the + // edges, which most of the time will cause an overflow + // (since technically its outside of the domain). + int temp = scratches.obtain(); + MOV_REG_TO_REG(fragment.reg, temp); + SAR(31, temp); + NOT(temp); + OR_REG_TO_REG(temp, fragment.reg); + component_sat(fragment, temp); + scratches.recycle(temp); + } + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::decodeLogicOpNeeds(const needs_t& needs) +{ + // gather some informations about the components we need to process... 
+ const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; + switch(opcode) { + case GGL_COPY: + mLogicOp = 0; + break; + case GGL_CLEAR: + case GGL_SET: + mLogicOp = LOGIC_OP; + break; + case GGL_AND: + case GGL_AND_REVERSE: + case GGL_AND_INVERTED: + case GGL_XOR: + case GGL_OR: + case GGL_NOR: + case GGL_EQUIV: + case GGL_OR_REVERSE: + case GGL_OR_INVERTED: + case GGL_NAND: + mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST; + break; + case GGL_NOOP: + case GGL_INVERT: + mLogicOp = LOGIC_OP|LOGIC_OP_DST; + break; + case GGL_COPY_INVERTED: + mLogicOp = LOGIC_OP|LOGIC_OP_SRC; + break; + }; +} + +void GGLX86Assembler::decodeTMUNeeds(const needs_t& needs, context_t const* c) +{ + uint8_t replaced=0; + mTextureMachine.mask = 0; + mTextureMachine.activeUnits = 0; + for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) { + texture_unit_t& tmu = mTextureMachine.tmu[i]; + if (replaced == 0xF) { + // all components are replaced, skip this TMU. + tmu.format_idx = 0; + tmu.mask = 0; + tmu.replaced = replaced; + continue; + } + tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]); + tmu.format = c->formats[tmu.format_idx]; + tmu.bits = tmu.format.size*8; + tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]); + tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]); + tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i])); + tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]); + tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i]) + && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now + + // 5551 linear filtering is not supported + if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551) + tmu.linear = 0; + + tmu.mask = 0; + tmu.replaced = replaced; + + if (tmu.format_idx) { + mTextureMachine.activeUnits++; + if (tmu.format.c[0].h) tmu.mask |= 0x1; + if (tmu.format.c[1].h) tmu.mask |= 0x2; + if (tmu.format.c[2].h) tmu.mask |= 0x4; + if (tmu.format.c[3].h) tmu.mask |= 0x8; + if (tmu.env == GGL_REPLACE) { + replaced |= tmu.mask; + } else if (tmu.env == GGL_DECAL) { + if 
(!tmu.format.c[GGLFormat::ALPHA].h) { + // if we don't have alpha, decal does nothing + tmu.mask = 0; + } else { + // decal always ignores At + tmu.mask &= ~(1<<GGLFormat::ALPHA); + } + } + } + mTextureMachine.mask |= tmu.mask; + ////printf("%d: mask=%08lx, replaced=%08lx\n", + // i, int(tmu.mask), int(tmu.replaced)); + } + mTextureMachine.replaced = replaced; + mTextureMachine.directTexture = 0; + ////printf("replaced=%08lx\n", mTextureMachine.replaced); +} + + +void GGLX86Assembler::init_textures( + tex_coord_t* coords, + const reg_t& x, const reg_t& y) +{ + context_t const* c = mBuilderContext.c; + const needs_t& needs = mBuilderContext.needs; + reg_t temp_reg_t; + int Rx = x.reg; + int Ry = y.reg; + + if (mTextureMachine.mask) { + comment("compute texture coordinates"); + } + + // init texture coordinates for each tmu + const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n); + const bool multiTexture = mTextureMachine.activeUnits > 1; + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { + const texture_unit_t& tmu = mTextureMachine.tmu[i]; + if (tmu.format_idx == 0) + continue; + if ((tmu.swrap == GGL_NEEDS_WRAP_11) && + (tmu.twrap == GGL_NEEDS_WRAP_11)) + { + Scratch scratches(registerFile()); + // 1:1 texture + pointer_t& txPtr = coords[i].ptr; + txPtr.setTo(obtainReg(), tmu.bits); + CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy); + SAR(16, txPtr.reg); + ADD_REG_TO_REG(txPtr.reg, Rx); + CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy); + SAR(16, txPtr.reg); + ADD_REG_TO_REG(txPtr.reg, Ry); + // Rx and Ry are changed + // Rx = Rx + ti.iterators.ydsdy>>16 + // Ry = Ry + ti.iterators.ydtdy>>16 + // Rx = Ry * ti.stide + Rx + + // merge base & offset + CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride); + IMUL(Ry, txPtr.reg); + ADD_REG_TO_REG(txPtr.reg, Rx); + + CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data); + temp_reg_t.setTo(Rx); + base_offset(txPtr, txPtr, temp_reg_t); + //PUSH(txPtr.reg); + mCurSp = mCurSp - 4; + 
txPtr.offset_ebp = mCurSp; //ebx, esi, edi, parts.count.reg, parts.cbPtr.reg, parts.z.reg + MOV_REG_TO_MEM(txPtr.reg, txPtr.offset_ebp, EBP); + recycleReg(txPtr.reg); + txPtr.reg=-1; + } else { + Scratch scratches(registerFile()); + reg_t& s = coords[i].s; + reg_t& t = coords[i].t; + // s = (x * dsdx)>>16 + ydsdy + // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0 + // t = (x * dtdx)>>16 + ydtdy + // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0 + const int need_w = GGL_READ_NEEDS(W, needs.n); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + if (need_w) { + s.setTo(obtainReg()); + t.setTo(obtainReg()); + CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy); + CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy); + CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]); + CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]); + recycleReg(s.reg); + recycleReg(t.reg); + } else { + int ydsdy = scratches.obtain(); + int dsdx = scratches.obtain(); + CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy); + CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx); + IMUL(Rx, dsdx); + ADD_REG_TO_REG(dsdx, ydsdy); + CONTEXT_STORE(ydsdy, generated_vars.texture[i].spill[0]); + scratches.recycle(ydsdy); + scratches.recycle(dsdx); + + int ydtdy = scratches.obtain(); + int dtdx = scratches.obtain(); + CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy); + CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx); + IMUL(Rx, dtdx); + ADD_REG_TO_REG(dtdx, ydtdy); + CONTEXT_STORE(ydtdy, generated_vars.texture[i].spill[1]); + scratches.recycle(ydtdy); + scratches.recycle(dtdx); + + // s.reg = Rx * ti.dsdx + ydsdy + // t.reg = Rx * ti.dtdx + ydtdy + } + } + + // direct texture? 
+ if (!multiTexture && !mBlending && !mDithering && !mFog && + cb_format_idx == tmu.format_idx && !tmu.linear && + mTextureMachine.replaced == tmu.mask) + { + mTextureMachine.directTexture = i + 1; + } + } +} + +void GGLX86Assembler::build_textures( fragment_parts_t& parts, + Scratch& regs) +{ + context_t const* c = mBuilderContext.c; + const needs_t& needs = mBuilderContext.needs; + reg_t temp_reg_t; + //int Rctx = mBuilderContext.Rctx; + + + const bool multiTexture = mTextureMachine.activeUnits > 1; + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { + const texture_unit_t& tmu = mTextureMachine.tmu[i]; + if (tmu.format_idx == 0) + continue; + + pointer_t& txPtr = parts.coords[i].ptr; + pixel_t& texel = parts.texel[i]; + + // repeat... + if ((tmu.swrap == GGL_NEEDS_WRAP_11) && + (tmu.twrap == GGL_NEEDS_WRAP_11)) + { // 1:1 textures + comment("fetch texel"); + texel.setTo(regs.obtain(), &tmu.format); + txPtr.reg = regs.obtain(); + MOV_MEM_TO_REG(txPtr.offset_ebp, EBP, txPtr.reg); + mCurSp = mCurSp - 4; + texel.offset_ebp = mCurSp; + load(txPtr, texel, WRITE_BACK); + MOV_REG_TO_MEM(texel.reg, texel.offset_ebp, EBP); + regs.recycle(texel.reg); + regs.recycle(txPtr.reg); + } else { + Scratch scratches(registerFile()); + reg_t& s = parts.coords[i].s; + reg_t& t = parts.coords[i].t; + comment("reload s/t (multitexture or linear filtering)"); + s.reg = scratches.obtain(); + t.reg = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]); + CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]); + + comment("compute repeat/clamp"); + int width = scratches.obtain(); + int height = scratches.obtain(); + int U = 0; + int V = 0; + // U and V will be stored onto the stack due to the limited register + reg_t reg_U, reg_V; + + CONTEXT_LOAD(width, generated_vars.texture[i].width); + CONTEXT_LOAD(height, generated_vars.texture[i].height); + 
scratches.recycle(mBuilderContext.Rctx); + + int FRAC_BITS = 0; + if (tmu.linear) { + // linear interpolation + if (tmu.format.size == 1) { + // for 8-bits textures, we can afford + // 7 bits of fractional precision at no + // additional cost (we can't do 8 bits + // because filter8 uses signed 16 bits muls) + FRAC_BITS = 7; + } else if (tmu.format.size == 2) { + // filter16() is internally limited to 4 bits, so: + // FRAC_BITS=2 generates less instructions, + // FRAC_BITS=3,4,5 creates unpleasant artifacts, + // FRAC_BITS=6+ looks good + FRAC_BITS = 6; + } else if (tmu.format.size == 4) { + // filter32() is internally limited to 8 bits, so: + // FRAC_BITS=4 looks good + // FRAC_BITS=5+ looks better, but generates 3 extra ipp + FRAC_BITS = 6; + } else { + // for all other cases we use 4 bits. + FRAC_BITS = 4; + } + } + int u = scratches.obtain(); + // s.reg and t.reg are recycled in wrapping + wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS, scratches); + int v = scratches.obtain(); + wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS, scratches); + + + if (tmu.linear) { + + //mBuilderContext.Rctx = scratches.obtain(); + //MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + //CONTEXT_LOAD(width, generated_vars.texture[i].width); + //CONTEXT_LOAD(height, generated_vars.texture[i].height); + //scratches.recycle(mBuilderContext.Rctx); + + comment("compute linear filtering offsets"); + // pixel size scale + const int shift = 31 - gglClz(tmu.format.size); + U = scratches.obtain(); + V = scratches.obtain(); + + + // sample the texel center + SUB_IMM_TO_REG(1<<(FRAC_BITS-1), u); + SUB_IMM_TO_REG(1<<(FRAC_BITS-1), v); + + // get the fractionnal part of U,V + MOV_REG_TO_REG(u, U); + AND_IMM_TO_REG((1<<FRAC_BITS)-1, U); + MOV_REG_TO_REG(v, V); + AND_IMM_TO_REG((1<<FRAC_BITS)-1, V); + + // below we will pop U and V in the filter function + mCurSp = mCurSp - 4; + MOV_REG_TO_MEM(U, mCurSp, EBP); + reg_U.offset_ebp = mCurSp; + mCurSp = mCurSp - 4; + MOV_REG_TO_MEM(V, mCurSp, 
EBP); + reg_V.offset_ebp = mCurSp; + + scratches.recycle(U); + scratches.recycle(V); + + // compute width-1 and height-1 + SUB_IMM_TO_REG(1, width); + SUB_IMM_TO_REG(1, height); + + // the registers are used up + int temp1 = scratches.obtain(); + int temp2 = scratches.obtain(); + // get the integer part of U,V and clamp/wrap + // and compute offset to the next texel + if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) { + // u has already been REPEATed + SAR(FRAC_BITS, u); + CMOV_REG_TO_REG(Mnemonic_CMOVS, width, u); + MOV_IMM_TO_REG(1<<shift, temp1); + MOV_REG_TO_REG(width, temp2); + // SHL may pollute the CF flag + SHL(shift, temp2); + mCurSp = mCurSp - 4; + int width_offset_ebp = mCurSp; + // width will be changed after the first comparison + MOV_REG_TO_MEM(width, width_offset_ebp, EBP); + CMP_REG_TO_REG(width, u); + CMOV_REG_TO_REG(Mnemonic_CMOVL, temp1, width); + if (shift) { + CMOV_REG_TO_REG(Mnemonic_CMOVGE, temp2, width); + } + MOV_REG_TO_REG(width, temp1); + NEG(temp1); + // width is actually changed + CMP_MEM_TO_REG(EBP, width_offset_ebp, u); + CMOV_REG_TO_REG(Mnemonic_CMOVGE, temp1, width); + } else { + // u has not been CLAMPed yet + // algorithm: + // if ((u>>4) >= width) + // u = width<<4 + // width = 0 + // else + // width = 1<<shift + // u = u>>4; // get integer part + // if (u<0) + // u = 0 + // width = 0 + // generated_vars.rt = width + + MOV_REG_TO_REG(width, temp2); + SHL(FRAC_BITS, temp2); + MOV_REG_TO_REG(u, temp1); + SAR(FRAC_BITS, temp1); + CMP_REG_TO_REG(temp1, width); + CMOV_REG_TO_REG(Mnemonic_CMOVLE, temp2, u); + // mov doesn't affect the flags + MOV_IMM_TO_REG(0, temp2); + CMOV_REG_TO_REG(Mnemonic_CMOVLE, temp2, width); + MOV_IMM_TO_REG(1 << shift, temp2); + CMOV_REG_TO_REG(Mnemonic_CMOVG, temp2, width); + + MOV_IMM_TO_REG(0, temp2); + SAR(FRAC_BITS, u); + CMOV_REG_TO_REG(Mnemonic_CMOVS, temp2, u); + CMOV_REG_TO_REG(Mnemonic_CMOVS, temp2, width); + } + scratches.recycle(temp1); + scratches.recycle(temp2); + mBuilderContext.Rctx = 
scratches.obtain(); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + CONTEXT_STORE(width, generated_vars.rt); + + const int stride = width; + CONTEXT_LOAD(stride, generated_vars.texture[i].stride); + scratches.recycle(mBuilderContext.Rctx); + + temp1 = scratches.obtain(); + temp2 = scratches.obtain(); + + int height_offset_ebp; + if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) { + // v has already been REPEATed + SAR(FRAC_BITS, v); + CMOV_REG_TO_REG(Mnemonic_CMOVS, height, v); + MOV_IMM_TO_REG(1<<shift, temp1); + MOV_REG_TO_REG(height, temp2); + SHL(shift, temp2); + mCurSp = mCurSp - 4; + height_offset_ebp = mCurSp; + // height will be changed after the first comparison + MOV_REG_TO_MEM(height, height_offset_ebp, EBP); + CMP_REG_TO_REG(height, v); + CMOV_REG_TO_REG(Mnemonic_CMOVL, temp1, height); + if (shift) { + CMOV_REG_TO_REG(Mnemonic_CMOVGE, temp2, height); + } + MOV_REG_TO_REG(height, temp1); + NEG(temp1); + // height is actually changed + CMP_MEM_TO_REG(EBP, height_offset_ebp, v); + CMOV_REG_TO_REG(Mnemonic_CMOVGE, temp1, height); + IMUL(stride, height); + } else { + // u has not been CLAMPed yet + MOV_REG_TO_REG(height, temp2); + SHL(FRAC_BITS, temp2); + MOV_REG_TO_REG(v, temp1); + SAR(FRAC_BITS, temp1); + + mCurSp = mCurSp - 4; + height_offset_ebp = mCurSp; + // height may be changed after the first comparison + MOV_REG_TO_MEM(height, height_offset_ebp, EBP); + + CMP_REG_TO_REG(temp1, height); + CMOV_REG_TO_REG(Mnemonic_CMOVLE, temp2, v); + MOV_IMM_TO_REG(0, temp2); + CMOV_REG_TO_REG(Mnemonic_CMOVLE, temp2, height); + + if (shift) { + // stride = width. 
It's not used + // shift may pollute the flags + SHL(shift, stride); + // height may be changed to 0 + CMP_REG_TO_MEM(temp1, height_offset_ebp, EBP); + CMOV_REG_TO_REG(Mnemonic_CMOVG, stride, height); + } else { + CMOV_REG_TO_REG(Mnemonic_CMOVG, stride, height); + } + MOV_IMM_TO_REG(0, temp2); + SAR(FRAC_BITS, v); + CMOV_REG_TO_REG(Mnemonic_CMOVS, temp2, v); + CMOV_REG_TO_REG(Mnemonic_CMOVS, temp2, height); + } + scratches.recycle(temp1); + scratches.recycle(temp2); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + CONTEXT_STORE(height, generated_vars.lb); + scratches.recycle(mBuilderContext.Rctx); + } + + scratches.recycle(width); + scratches.recycle(height); + + // iterate texture coordinates... + comment("iterate s,t"); + int dsdx = scratches.obtain(); + s.reg = scratches.obtain(); + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx); + CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]); + ADD_REG_TO_REG(dsdx, s.reg); + CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]); + scratches.recycle(s.reg); + scratches.recycle(dsdx); + int dtdx = scratches.obtain(); + t.reg = scratches.obtain(); + CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx); + CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]); + ADD_REG_TO_REG(dtdx, t.reg); + CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]); + scratches.recycle(dtdx); + scratches.recycle(t.reg); + + // merge base & offset... 
+ comment("merge base & offset"); + texel.setTo(scratches.obtain(), &tmu.format); + //txPtr.setTo(texel.reg, tmu.bits); + txPtr.setTo(scratches.obtain(), tmu.bits); + int stride = scratches.obtain(); + CONTEXT_LOAD(stride, generated_vars.texture[i].stride); + CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data); + scratches.recycle(mBuilderContext.Rctx); + MOVSX_REG_TO_REG(OpndSize_16, v, v); + MOVSX_REG_TO_REG(OpndSize_16, stride, stride); + IMUL(v, stride); + ADD_REG_TO_REG(stride, u);// u+v*stride + temp_reg_t.setTo(u); + base_offset(txPtr, txPtr, temp_reg_t); + + // recycle registers we don't need anymore + scratches.recycle(u); + scratches.recycle(v); + scratches.recycle(stride); + + mCurSp = mCurSp - 4; + texel.offset_ebp = mCurSp; + // load texel + if (!tmu.linear) { + comment("fetch texel in building texture"); + load(txPtr, texel, 0); + MOV_REG_TO_MEM(texel.reg, texel.offset_ebp, EBP); + scratches.recycle(texel.reg); + scratches.recycle(txPtr.reg); + } else { + comment("fetch texel, bilinear"); + // the registes are not enough. 
We spill texel and previous U and V + // texel.reg is recycled in the following functions since there are more than one code path + switch (tmu.format.size) { + case 1: + filter8(parts, texel, tmu, reg_U, reg_V, txPtr, FRAC_BITS, scratches); + break; + case 2: + filter16(parts, texel, tmu, reg_U, reg_V, txPtr, FRAC_BITS, scratches); + break; + case 3: + filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); + break; + case 4: + filter32(parts, texel, tmu, reg_U, reg_V, txPtr, FRAC_BITS, scratches); + break; + } + } + } + } +} + +void GGLX86Assembler::build_iterate_texture_coordinates( + const fragment_parts_t& parts) +{ + const bool multiTexture = mTextureMachine.activeUnits > 1; + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { + const texture_unit_t& tmu = mTextureMachine.tmu[i]; + if (tmu.format_idx == 0) + continue; + + if ((tmu.swrap == GGL_NEEDS_WRAP_11) && + (tmu.twrap == GGL_NEEDS_WRAP_11)) + { // 1:1 textures + const pointer_t& txPtr = parts.coords[i].ptr; + ADD_IMM_TO_MEM(txPtr.size>>3, txPtr.offset_ebp, EBP); + } else { + Scratch scratches(registerFile()); + int s = parts.coords[i].s.reg; + int t = parts.coords[i].t.reg; + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + s = scratches.obtain(); + int dsdx = scratches.obtain(); + CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]); + CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx); + ADD_REG_TO_REG(dsdx, s); + CONTEXT_STORE(s, generated_vars.texture[i].spill[0]); + scratches.recycle(s); + scratches.recycle(dsdx); + int dtdx = scratches.obtain(); + t = scratches.obtain(); + CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]); + CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx); + ADD_REG_TO_REG(dtdx, t); + CONTEXT_STORE(t, generated_vars.texture[i].spill[1]); + scratches.recycle(t); + scratches.recycle(dtdx); + } + } +} + +void GGLX86Assembler::filter8( + const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, 
reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches) +{ + if (tmu.format.components != GGL_ALPHA && + tmu.format.components != GGL_LUMINANCE) + { + // this is a packed format, and we don't support + // linear filtering (it's probably RGB 332) + // Should not happen with OpenGL|ES + MOVZX_MEM_TO_REG(OpndSize_8, txPtr.reg, 0, texel.reg); + MOV_REG_TO_MEM(texel.reg, texel.offset_ebp, EBP); + scratches.recycle(texel.reg); + scratches.recycle(txPtr.reg); + return; + } + + // ------------------------ + + //int d = scratches.obtain(); + //int u = scratches.obtain(); + //int k = scratches.obtain(); + + scratches.recycle(texel.reg); + int rt = scratches.obtain(); + int lb = scratches.obtain(); + + // RB -> U * V + + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(rt, generated_vars.rt); + CONTEXT_LOAD(lb, generated_vars.lb); + scratches.recycle(mBuilderContext.Rctx); + int pixel= scratches.obtain(); + + int offset = pixel; + + MOV_REG_TO_REG(rt, offset); + ADD_REG_TO_REG(lb, offset); + + int temp_reg1 = scratches.obtain(); + int temp_reg2 = scratches.obtain(); + // it seems that the address mode with base and scale reg cannot be encoded correctly + //MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, temp_reg1, OpndSize_8); + ADD_REG_TO_REG(txPtr.reg, offset); + MOVZX_MEM_TO_REG(OpndSize_8, offset, 0, temp_reg1); + // pixel is only 8-bits + MOV_REG_TO_REG(temp_reg1, pixel); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_U.offset_ebp, temp_reg1); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg2); + IMUL(temp_reg2, temp_reg1); + MOVSX_REG_TO_REG(OpndSize_16, pixel, pixel); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg1, temp_reg2); + IMUL(temp_reg2, pixel); + NEG(temp_reg1); + ADD_IMM_TO_REG(1<<(FRAC_BITS*2), temp_reg1); + mCurSp = mCurSp - 4; + int d_offset_ebp = mCurSp; + MOV_REG_TO_MEM(pixel, d_offset_ebp, EBP); + mCurSp = mCurSp - 4; + int k_offset_ebp = mCurSp; + MOV_REG_TO_MEM(temp_reg1, k_offset_ebp, 
EBP); + + + // LB -> (1-U) * V + MOV_MEM_TO_REG(reg_U.offset_ebp, EBP, temp_reg2); + NEG(temp_reg2); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg2); + MOV_REG_TO_MEM(temp_reg2, reg_U.offset_ebp, EBP); + + //MOV_MEM_SCALE_TO_REG(txPtr.reg, lb, 1, pixel, OpndSize_8); + ADD_REG_TO_REG(txPtr.reg, lb); + MOVZX_MEM_TO_REG(OpndSize_8, lb, 0, pixel); + + MOVSX_REG_TO_REG(OpndSize_16, temp_reg2, temp_reg2); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg1); + IMUL(temp_reg1, temp_reg2); + MOVSX_REG_TO_REG(OpndSize_16, pixel, pixel); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg2, temp_reg1); + IMUL(pixel, temp_reg1); + ADD_REG_TO_MEM(temp_reg1, EBP, d_offset_ebp); + SUB_REG_TO_MEM(temp_reg2, EBP, k_offset_ebp); + + + // LT -> (1-U)*(1-V) + MOV_MEM_TO_REG(reg_V.offset_ebp, EBP, temp_reg2); + NEG(temp_reg2); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg2); + MOV_REG_TO_MEM(temp_reg2, reg_V.offset_ebp, EBP); + + MOVZX_MEM_TO_REG(OpndSize_8, txPtr.reg, 0, pixel); + + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_U.offset_ebp, temp_reg1); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg2, temp_reg2); + IMUL(temp_reg1, temp_reg2); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg2, temp_reg1); + MOVSX_REG_TO_REG(OpndSize_16, pixel, pixel); + IMUL(pixel, temp_reg1); + ADD_REG_TO_MEM(temp_reg1, EBP, d_offset_ebp); + + // RT -> U*(1-V) + //MOV_MEM_SCALE_TO_REG(txPtr.reg, rt, 1, pixel, OpndSize_8); + ADD_REG_TO_REG(txPtr.reg, rt); + MOVZX_MEM_TO_REG(OpndSize_8, rt, 0, pixel); + + int k = rt; + MOV_MEM_TO_REG(k_offset_ebp, EBP, k); + SUB_REG_TO_REG(temp_reg2, k); + MOVSX_REG_TO_REG(OpndSize_16, pixel, pixel); + MOVSX_REG_TO_REG(OpndSize_16, k, k); + IMUL(pixel, k); + ADD_MEM_TO_REG(EBP, d_offset_ebp, k); + MOV_REG_TO_MEM(k, texel.offset_ebp, EBP); + scratches.recycle(rt); + scratches.recycle(lb); + scratches.recycle(pixel); + scratches.recycle(txPtr.reg); + scratches.recycle(temp_reg1); + scratches.recycle(temp_reg2); + for (int i=0 ; i<4 ; i++) { + if (!texel.format.c[i].h) continue; + texel.format.c[i].h = 
FRAC_BITS*2+8; + texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough + } + texel.format.size = 4; + texel.format.bitsPerPixel = 32; + texel.flags |= CLEAR_LO; +} + +void GGLX86Assembler::filter16( + const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches) +{ + // compute the mask + // XXX: it would be nice if the mask below could be computed + // automatically. + uint32_t mask = 0; + int shift = 0; + int prec = 0; + switch (tmu.format_idx) { + case GGL_PIXEL_FORMAT_RGB_565: + // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb + // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb + mask = 0x07E0F81F; + shift = 16; + prec = 5; + break; + case GGL_PIXEL_FORMAT_RGBA_4444: + // 0000,1111,0000,1111 | 0000,1111,0000,1111 + mask = 0x0F0F0F0F; + shift = 12; + prec = 4; + break; + case GGL_PIXEL_FORMAT_LA_88: + // 0000,0000,1111,1111 | 0000,0000,1111,1111 + // AALL -> 00AA | 00LL + mask = 0x00FF00FF; + shift = 8; + prec = 8; + break; + default: + // unsupported format, do something sensical... + ALOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx); + MOVZX_MEM_TO_REG(OpndSize_16, txPtr.reg, 0, texel.reg); + MOV_REG_TO_MEM(texel.reg, texel.offset_ebp, EBP); + scratches.recycle(texel.reg); + scratches.recycle(txPtr.reg); + return; + } + + const int adjust = FRAC_BITS*2 - prec; + const int round = 0; + + // update the texel format + texel.format.size = 4; + texel.format.bitsPerPixel = 32; + texel.flags |= CLEAR_HI|CLEAR_LO; + for (int i=0 ; i<4 ; i++) { + if (!texel.format.c[i].h) continue; + const uint32_t offset = (mask & tmu.format.mask(i)) ? 
0 : shift; + texel.format.c[i].h = tmu.format.c[i].h + offset + prec; + texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec); + } + + // ------------------------ + + scratches.recycle(texel.reg); + + int pixel= scratches.obtain(); + int u = scratches.obtain(); + int temp_reg1 = scratches.obtain(); + + // RB -> U * V + //printf("RB -> U * V \n"); + int offset = pixel; + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(offset, generated_vars.rt); + CONTEXT_LOAD(u, generated_vars.lb); + ADD_REG_TO_REG(u, offset); + + //MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, temp_reg1, OpndSize_16); + ADD_REG_TO_REG(txPtr.reg, offset); + MOVZX_MEM_TO_REG(OpndSize_16, offset, 0, temp_reg1); + + MOV_REG_TO_REG(temp_reg1, pixel); + + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_U.offset_ebp, u); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg1); + IMUL(temp_reg1, u); + MOV_REG_TO_REG(pixel, temp_reg1); + SHL(shift, temp_reg1); + OR_REG_TO_REG(temp_reg1, pixel); + build_and_immediate(pixel, pixel, mask, 32); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), u); + SHR(adjust, u); + } + int d = scratches.obtain(); + MOV_REG_TO_REG(u, d); + IMUL(pixel, d); + NEG(u); + ADD_IMM_TO_REG(1<<prec, u); + + + // LB -> (1-U) * V + //printf("LB -> (1- U) * V \n"); + MOV_MEM_TO_REG(reg_U.offset_ebp, EBP, temp_reg1); + NEG(temp_reg1); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg1); + MOV_REG_TO_MEM(temp_reg1, reg_U.offset_ebp, EBP); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg1, temp_reg1); + + CONTEXT_LOAD(offset, generated_vars.lb); + scratches.recycle(mBuilderContext.Rctx); + //MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, pixel, OpndSize_16); + ADD_REG_TO_REG(txPtr.reg, offset); + MOVZX_MEM_TO_REG(OpndSize_16, offset, 0, pixel); + + int temp_reg2 = scratches.obtain(); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg2); + IMUL(temp_reg1, temp_reg2); + MOV_REG_TO_REG(pixel, temp_reg1); + SHL(shift, 
temp_reg1); + OR_REG_TO_REG(temp_reg1, pixel); + build_and_immediate(pixel, pixel, mask, 32); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), temp_reg2); + SHR(adjust, temp_reg2); + } + IMUL(temp_reg2, pixel); + ADD_REG_TO_REG(pixel, d); + SUB_REG_TO_REG(temp_reg2, u); + + + // LT -> (1-U)*(1-V) + //printf("LT -> (1- U)*(1-V) \n"); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg2); + NEG(temp_reg2); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg2); + MOV_REG_TO_MEM(temp_reg2, reg_V.offset_ebp, EBP); + MOVZX_MEM_TO_REG(OpndSize_16, txPtr.reg, 0, pixel); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_U.offset_ebp, temp_reg1); + IMUL(temp_reg1, temp_reg2); + MOV_REG_TO_REG(pixel, temp_reg1); + SHL(shift, temp_reg1); + OR_REG_TO_REG(temp_reg1, pixel); + build_and_immediate(pixel, pixel, mask, 32); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), temp_reg2); + SHR(adjust, temp_reg2); + } + IMUL(temp_reg2, pixel); + ADD_REG_TO_REG(pixel, d); + + + // RT -> U*(1-V) + //printf("RT -> U*(1-V) \n"); + SUB_REG_TO_REG(temp_reg2, u); + mBuilderContext.Rctx = temp_reg2; + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(temp_reg1, generated_vars.rt); + //MOV_MEM_SCALE_TO_REG(txPtr.reg, temp_reg1, 1, pixel, OpndSize_16); + ADD_REG_TO_REG(txPtr.reg, temp_reg1); + MOVZX_MEM_TO_REG(OpndSize_16, temp_reg1, 0, pixel); + + MOV_REG_TO_REG(pixel, temp_reg1); + SHL(shift, temp_reg1); + OR_REG_TO_REG(temp_reg1, pixel); + build_and_immediate(pixel, pixel, mask, 32); + IMUL(u, pixel); + ADD_REG_TO_REG(pixel, d); + MOV_REG_TO_MEM(d, texel.offset_ebp, EBP); + scratches.recycle(d); + scratches.recycle(pixel); + scratches.recycle(u); + scratches.recycle(txPtr.reg); + scratches.recycle(temp_reg1); + scratches.recycle(temp_reg2); +} + +void GGLX86Assembler::filter24( + const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + int U, int V, pointer_t& txPtr, + int FRAC_BITS) +{ + // not supported yet (currently disabled) + load(txPtr, 
texel, 0); +} + +void GGLX86Assembler::filter32( + const fragment_parts_t& parts, + pixel_t& texel, const texture_unit_t& tmu, + reg_t reg_U, reg_t reg_V, pointer_t& txPtr, + int FRAC_BITS, Scratch& scratches) +{ + const int adjust = FRAC_BITS*2 - 8; + const int round = 0; + + // ------------------------ + scratches.recycle(texel.reg); + int mask = scratches.obtain(); + int pixel= scratches.obtain(); + int u = scratches.obtain(); + + //int dh = scratches.obtain(); + //int k = scratches.obtain(); + //int temp = scratches.obtain(); + //int dl = scratches.obtain(); + + MOV_IMM_TO_REG(0xFF, mask); + OR_IMM_TO_REG(0xFF0000, mask); + + // RB -> U * V + int offset = pixel; + mBuilderContext.Rctx = scratches.obtain(); + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(offset, generated_vars.rt); + CONTEXT_LOAD(u, generated_vars.lb); + ADD_REG_TO_REG(u, offset); + scratches.recycle(mBuilderContext.Rctx); + + //MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, u); + ADD_REG_TO_REG(txPtr.reg, offset); + MOV_MEM_TO_REG(0, offset, u); + + MOV_REG_TO_REG(u, pixel); + + int temp_reg1 = scratches.obtain(); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_U.offset_ebp, temp_reg1); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, u); + IMUL(temp_reg1, u); + MOV_REG_TO_REG(mask, temp_reg1); + AND_REG_TO_REG(pixel, temp_reg1); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), u); + SHR(adjust, u); + } + int temp_reg2 = scratches.obtain(); + MOV_REG_TO_REG(temp_reg1, temp_reg2); + IMUL(u, temp_reg2); + SHR(8, pixel); + AND_REG_TO_REG(mask, pixel); + IMUL(u, pixel); + NEG(u); + ADD_IMM_TO_REG(0x100, u); + mCurSp = mCurSp - 4; + int dh_offset_ebp = mCurSp; + MOV_REG_TO_MEM(temp_reg2, dh_offset_ebp, EBP); + mCurSp = mCurSp - 4; + int dl_offset_ebp = mCurSp; + MOV_REG_TO_MEM(pixel, dl_offset_ebp, EBP); + + // LB -> (1-U) * V + mBuilderContext.Rctx = temp_reg2; + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(offset, generated_vars.lb); + 
//MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, temp_reg2); + ADD_REG_TO_REG(txPtr.reg, offset); + MOV_MEM_TO_REG(0, offset, temp_reg2); + + MOV_REG_TO_REG(temp_reg2, pixel); + MOV_MEM_TO_REG(reg_U.offset_ebp, EBP, temp_reg1); + NEG(temp_reg1); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg1); + MOV_REG_TO_MEM(temp_reg1, reg_U.offset_ebp, EBP); + MOVSX_REG_TO_REG(OpndSize_16, temp_reg1, temp_reg1); + MOVSX_MEM_TO_REG(OpndSize_16, EBP, reg_V.offset_ebp, temp_reg2); + IMUL(temp_reg2, temp_reg1); + MOV_REG_TO_REG(mask, temp_reg2); + AND_REG_TO_REG(pixel, temp_reg2); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), temp_reg1); + SHR(adjust, temp_reg1); + } + // if we use push and pop txPtr.reg later, It will cause the bad locality, since the esp is already been subtracted before the loop. + // we will spill txPtr.reg due to the limited register + mCurSp = mCurSp - 4; + int txPtr_offset_ebp = mCurSp; + MOV_REG_TO_MEM(txPtr.reg, txPtr_offset_ebp, EBP); + //PUSH(txPtr.reg); + + int temp_reg3 = txPtr.reg; + MOV_REG_TO_REG(temp_reg2, temp_reg3); + IMUL(temp_reg1, temp_reg3); + ADD_REG_TO_MEM(temp_reg3, EBP, dh_offset_ebp); + SHR(8, pixel); + AND_REG_TO_REG(mask, pixel); + IMUL(temp_reg1, pixel); + ADD_REG_TO_MEM(pixel, EBP, dl_offset_ebp); + SUB_REG_TO_REG(temp_reg1, u); + + + // LT -> (1-U)*(1-V) + MOV_MEM_TO_REG(reg_V.offset_ebp, EBP, temp_reg1); + NEG(temp_reg1); + ADD_IMM_TO_REG(1<<FRAC_BITS, temp_reg1); + MOV_REG_TO_MEM(temp_reg1, reg_V.offset_ebp, EBP); + MOV_MEM_TO_REG(reg_U.offset_ebp, EBP, temp_reg2); + + MOV_MEM_TO_REG(txPtr_offset_ebp, EBP, txPtr.reg); + //POP(txPtr.reg); + + MOV_MEM_TO_REG(0, txPtr.reg, pixel); + IMUL(temp_reg2, temp_reg1); + //we have already saved txPtr.reg + temp_reg3 = txPtr.reg; + MOV_REG_TO_REG(pixel, temp_reg3); + AND_REG_TO_REG(mask, temp_reg3); + if (adjust) { + if (round) + ADD_IMM_TO_REG(1<<(adjust-1), temp_reg1); + SHR(adjust, temp_reg1); + } + IMUL(temp_reg1, temp_reg3); + ADD_REG_TO_MEM(temp_reg3, EBP, dh_offset_ebp); + SHR(8, 
pixel); + AND_REG_TO_REG(mask, pixel); + IMUL(temp_reg1, pixel); + ADD_REG_TO_MEM(pixel, EBP, dl_offset_ebp); + + // RT -> U*(1-V) + SUB_REG_TO_REG(temp_reg1, u); + mBuilderContext.Rctx = temp_reg2; + MOV_MEM_TO_REG(8, EBP, mBuilderContext.Rctx); + CONTEXT_LOAD(offset, generated_vars.rt); + + MOV_MEM_TO_REG(txPtr_offset_ebp, EBP, txPtr.reg); + //POP(txPtr.reg); + + //MOV_MEM_SCALE_TO_REG(txPtr.reg, offset, 1, temp_reg2); + ADD_REG_TO_REG(txPtr.reg, offset); + MOV_MEM_TO_REG(0, offset, temp_reg2); + + MOV_REG_TO_REG(temp_reg2, pixel); + AND_REG_TO_REG(mask, temp_reg2); + IMUL(u, temp_reg2); + ADD_REG_TO_MEM(temp_reg2, EBP, dh_offset_ebp); + SHR(8, pixel); + AND_REG_TO_REG(mask, pixel); + IMUL(u, pixel); + ADD_REG_TO_MEM(pixel, EBP, dl_offset_ebp); + MOV_MEM_TO_REG(dh_offset_ebp, EBP, temp_reg1); + MOV_MEM_TO_REG(dl_offset_ebp, EBP, temp_reg2); + SHR(8, temp_reg1); + AND_REG_TO_REG(mask, temp_reg1); + SHL(8, mask); + AND_REG_TO_REG(mask, temp_reg2); + OR_REG_TO_REG(temp_reg1, temp_reg2); + MOV_REG_TO_MEM(temp_reg2, texel.offset_ebp, EBP); + scratches.recycle(u); + scratches.recycle(mask); + scratches.recycle(pixel); + scratches.recycle(txPtr.reg); + scratches.recycle(temp_reg1); + scratches.recycle(temp_reg2); + +} + +void GGLX86Assembler::build_texture_environment( + component_t& fragment, + fragment_parts_t& parts, + int component, + Scratch& regs) +{ + const uint32_t component_mask = 1<<component; + const bool multiTexture = mTextureMachine.activeUnits > 1; + Scratch scratches(registerFile()); + for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) { + texture_unit_t& tmu = mTextureMachine.tmu[i]; + + if (tmu.mask & component_mask) { + // replace or modulate with this texture + if ((tmu.replaced & component_mask) == 0) { + // not replaced by a later tmu... 
+ + pixel_t texel(parts.texel[i]); + if (multiTexture && + tmu.swrap == GGL_NEEDS_WRAP_11 && + tmu.twrap == GGL_NEEDS_WRAP_11) + { + texel.reg = scratches.obtain(); + texel.flags |= CORRUPTIBLE; + mCurSp = mCurSp - 4; + texel.offset_ebp = mCurSp; + comment("fetch texel (multitexture 1:1)"); + parts.coords[i].ptr.reg = scratches.obtain(); + MOV_MEM_TO_REG(parts.coords[i].ptr.offset_ebp, EBP, parts.coords[i].ptr.reg); + load(parts.coords[i].ptr, texel, WRITE_BACK); + MOV_REG_TO_MEM(texel.reg, texel.offset_ebp, EBP); + scratches.recycle(parts.coords[i].ptr.reg); + } else { + // the texel is already loaded in building textures + texel.reg = scratches.obtain(); + MOV_MEM_TO_REG(texel.offset_ebp, EBP, texel.reg); + } + + component_t incoming(fragment); + modify(fragment, regs); + + switch (tmu.env) { + case GGL_REPLACE: + extract(fragment, texel, component); + break; + case GGL_MODULATE: + modulate(fragment, incoming, texel, component); + break; + case GGL_DECAL: + decal(fragment, incoming, texel, component); + break; + case GGL_BLEND: + blend(fragment, incoming, texel, component, i); + break; + case GGL_ADD: + add(fragment, incoming, texel, component); + break; + } + scratches.recycle(texel.reg); + } + } + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::wrapping( + int d, + int coord, int size, + int tx_wrap, int tx_linear, Scratch& scratches) +{ + // coord is recycled after return, so it can be written. 
+ // notes: + // if tx_linear is set, we need 4 extra bits of precision on the result + // SMULL/UMULL is 3 cycles + // coord is actually s.reg or t.reg which will not be used + int c = coord; + if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) { + // UMULL takes 4 cycles (interlocked), and we can get away with + // 2 cycles using SMULWB, but we're loosing 16 bits of precision + // out of 32 (this is not a problem because the iterator keeps + // its full precision) + // UMULL(AL, 0, size, d, c, size); + // note: we can't use SMULTB because it's signed. + MOV_REG_TO_REG(c, d); + SHR(16-tx_linear, d); + int temp_reg; + if(c != EDX) + temp_reg = c; + else { + temp_reg = scratches.obtain(); + scratches.recycle(c); + } + int flag_push_edx = -1; + int flag_reserve_edx = -1; + int edx_offset_ebp = 0; + if(scratches.isUsed(EDX) == 1) { //not indicates that the registers are used up. Probably, previous allocated registers are recycled + if((d != EDX) && (size != EDX)) { + flag_push_edx = 1; + mCurSp = mCurSp - 4; + edx_offset_ebp = mCurSp; + MOV_REG_TO_MEM(EDX, edx_offset_ebp, EBP); + //PUSH(EDX); + } + } + else { + flag_reserve_edx = 1; + scratches.reserve(EDX); + } + if(scratches.isUsed(EAX)) { + if( size == EAX || d == EAX) { + // size is actually width and height, which will probably be used after wrapping + MOV_REG_TO_REG(size, temp_reg); + MOVSX_REG_TO_REG(OpndSize_16, size, size); + if(size == EAX) + IMUL(d); + else + IMUL(size); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d); + + MOV_REG_TO_REG(temp_reg, size); + } + else { + if(temp_reg != EAX) + MOV_REG_TO_REG(EAX, temp_reg); + MOV_REG_TO_REG(size, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(d); + SHL(16, EDX); + SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d); + if(temp_reg != EAX) + MOV_REG_TO_REG(temp_reg, EAX); + } + } + else { + MOV_REG_TO_REG(size, EAX); + MOVSX_REG_TO_REG(OpndSize_16, EAX, EAX); + IMUL(d); + SHL(16, EDX); + 
SHR(16, EAX); + MOV_REG_TO_REG(EAX, EDX, OpndSize_16); + MOV_REG_TO_REG(EDX, d); + } + if(flag_push_edx == 1) { + MOV_MEM_TO_REG(edx_offset_ebp, EBP, EDX); + //POP(EDX); + } + if(flag_reserve_edx ==1) + scratches.recycle(EDX); + + scratches.recycle(temp_reg); + //IMUL(size, d) will cause segmentation fault with GlobalTime + } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) { + if (tx_linear) { + // 1 cycle + MOV_REG_TO_REG(coord, d); + SAR(16-tx_linear, d); + } else { + SAR(16, coord); + MOV_REG_TO_REG(coord, d); + SAR(31, coord); + NOT(coord); + AND_REG_TO_REG(coord, d); + + MOV_REG_TO_REG(size, coord); + SUB_IMM_TO_REG(1, coord); + + CMP_REG_TO_REG(size, d); + CMOV_REG_TO_REG(Mnemonic_CMOVGE, coord, d); + + } + scratches.recycle(coord); + } +} + +// --------------------------------------------------------------------------- + +void GGLX86Assembler::modulate( + component_t& dest, + const component_t& incoming, + const pixel_t& incomingTexel, int component) +{ + Scratch locals(registerFile()); + integer_t texel(locals.obtain(), 32, CORRUPTIBLE); + extract(texel, incomingTexel, component); + + const int Nt = texel.size(); + // Nt should always be less than 10 bits because it comes + // from the TMU. 
+ + int Ni = incoming.size(); + // Ni could be big because it comes from previous MODULATEs + + if (Nt == 1) { + // texel acts as a bit-mask + // dest = incoming & ((texel << incoming.h)-texel) + MOV_REG_TO_REG(texel.reg, dest.reg); + SHL(incoming.h, dest.reg); + SUB_REG_TO_REG(texel.reg, dest.reg); + dest.l = incoming.l; + dest.h = incoming.h; + dest.flags |= (incoming.flags & CLEAR_LO); + } else if (Ni == 1) { + SHL(31-incoming.h, incoming.reg); + MOV_REG_TO_REG(incoming.reg, dest.reg); + SAR(31, dest.reg); + AND_REG_TO_REG(texel.reg, dest.reg); + dest.l = 0; + dest.h = Nt; + } else { + int inReg = incoming.reg; + int shift = incoming.l; + if ((Nt + Ni) > 32) { + // we will overflow, reduce the precision of Ni to 8 bits + // (Note Nt cannot be more than 10 bits which happens with + // 565 textures and GGL_LINEAR) + shift += Ni-8; + Ni = 8; + } + + // modulate by the component with the lowest precision + if (Nt >= Ni) { + if (shift) { + // XXX: we should be able to avoid this shift + // when shift==16 && Nt<16 && Ni<16, in which + // we could use SMULBT below. 
+ MOV_REG_TO_REG(inReg, dest.reg); + SHR(shift, inReg); + inReg = dest.reg; + shift = 0; + } + int temp_reg = locals.obtain(); + // operation: (Cf*Ct)/((1<<Ni)-1) + // approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni + // this operation doesn't change texel's size + MOV_REG_TO_REG(inReg, temp_reg); + SHR(Ni-1, temp_reg); + MOV_REG_TO_REG(inReg, dest.reg); + ADD_REG_TO_REG(temp_reg, dest.reg); + locals.recycle(temp_reg); + if (Nt<16 && Ni<16) { + MOVSX_REG_TO_REG(OpndSize_16, texel.reg, texel.reg); + MOVSX_REG_TO_REG(OpndSize_16, dest.reg, dest.reg); + IMUL(texel.reg, dest.reg); + } + else + IMUL(texel.reg, dest.reg); + dest.l = Ni; + dest.h = Nt + Ni; + } else { + if (shift && (shift != 16)) { + // if shift==16, we can use 16-bits mul instructions later + MOV_REG_TO_REG(inReg, dest.reg); + SHR(shift, dest.reg); + inReg = dest.reg; + shift = 0; + } + // operation: (Cf*Ct)/((1<<Nt)-1) + // approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt + // this operation doesn't change incoming's size + Scratch scratches(registerFile()); + int temp_reg = locals.obtain(); + int t = (texel.flags & CORRUPTIBLE) ? 
texel.reg : dest.reg; + if (t == inReg) + t = scratches.obtain(); + + MOV_REG_TO_REG(texel.reg, temp_reg); + SHR(Nt-1, temp_reg); + ADD_REG_TO_REG(temp_reg, texel.reg); + MOV_REG_TO_REG(texel.reg, t); + locals.recycle(temp_reg); + MOV_REG_TO_REG(inReg, dest.reg); + if (Nt<16 && Ni<16) { + if (shift==16) { + MOVSX_REG_TO_REG(OpndSize_16, t, t); + SHR(16, dest.reg); + MOVSX_REG_TO_REG(OpndSize_16, dest.reg, dest.reg); + IMUL(t, dest.reg); + } + else { + MOVSX_REG_TO_REG(OpndSize_16, dest.reg, dest.reg); + MOVSX_REG_TO_REG(OpndSize_16, t, t); + IMUL(t, dest.reg); + } + } else + IMUL(t, dest.reg); + dest.l = Nt; + dest.h = Nt + Ni; + } + + // low bits are not valid + dest.flags |= CLEAR_LO; + + // no need to keep more than 8 bits/component + if (dest.size() > 8) + dest.l = dest.h-8; + } +} + +void GGLX86Assembler::decal( + component_t& dest, + const component_t& incoming, + const pixel_t& incomingTexel, int component) +{ + // RGBA: + // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At + // Av = Af + Scratch locals(registerFile()); + integer_t texel(locals.obtain(), 32, CORRUPTIBLE); + integer_t factor(locals.obtain(), 32, CORRUPTIBLE); + extract(texel, incomingTexel, component); + extract(factor, incomingTexel, GGLFormat::ALPHA); + + // no need to keep more than 8-bits for decal + int Ni = incoming.size(); + int shift = incoming.l; + if (Ni > 8) { + shift += Ni-8; + Ni = 8; + } + integer_t incomingNorm(incoming.reg, Ni, incoming.flags); + if (shift) { + SHR(shift, incomingNorm.reg); + MOV_REG_TO_REG(incomingNorm.reg, dest.reg); + incomingNorm.reg = dest.reg; + incomingNorm.flags |= CORRUPTIBLE; + } + int temp = locals.obtain(); + MOV_REG_TO_REG(factor.reg, temp); + SHR(factor.s-1, temp); + ADD_REG_TO_REG(temp, factor.reg); + locals.recycle(temp); + build_blendOneMinusFF(dest, factor, incomingNorm, texel); +} + +void GGLX86Assembler::blend( + component_t& dest, + const component_t& incoming, + const pixel_t& incomingTexel, int component, int tmu) +{ + // RGBA: + // Cv = (1 
- Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct + // Av = At*Af + + if (component == GGLFormat::ALPHA) { + modulate(dest, incoming, incomingTexel, component); + return; + } + + Scratch locals(registerFile()); + int temp = locals.obtain(); + integer_t color(locals.obtain(), 8, CORRUPTIBLE); + integer_t factor(locals.obtain(), 32, CORRUPTIBLE); + mBuilderContext.Rctx = temp; + MOV_MEM_TO_REG(8, PhysicalReg_EBP, mBuilderContext.Rctx); + MOVZX_MEM_TO_REG(OpndSize_8, mBuilderContext.Rctx, GGL_OFFSETOF(state.texture[tmu].env_color[component]), color.reg); + extract(factor, incomingTexel, component); + + // no need to keep more than 8-bits for blend + int Ni = incoming.size(); + int shift = incoming.l; + if (Ni > 8) { + shift += Ni-8; + Ni = 8; + } + integer_t incomingNorm(incoming.reg, Ni, incoming.flags); + if (shift) { + MOV_REG_TO_REG(incomingNorm.reg, dest.reg); + SHR(shift, dest.reg); + incomingNorm.reg = dest.reg; + incomingNorm.flags |= CORRUPTIBLE; + } + MOV_REG_TO_REG(factor.reg, temp); + SHR(factor.s-1, temp); + ADD_REG_TO_REG(temp, factor.reg); + locals.recycle(temp); + build_blendOneMinusFF(dest, factor, incomingNorm, color); +} + +void GGLX86Assembler::add( + component_t& dest, + const component_t& incoming, + const pixel_t& incomingTexel, int component) +{ + // RGBA: + // Cv = Cf + Ct; + Scratch locals(registerFile()); + + component_t incomingTemp(incoming); + + // use "dest" as a temporary for extracting the texel, unless "dest" + // overlaps "incoming". 
+ integer_t texel(dest.reg, 32, CORRUPTIBLE); + if (dest.reg == incomingTemp.reg) + texel.reg = locals.obtain(); + extract(texel, incomingTexel, component); + + if (texel.s < incomingTemp.size()) { + expand(texel, texel, incomingTemp.size()); + } else if (texel.s > incomingTemp.size()) { + if (incomingTemp.flags & CORRUPTIBLE) { + expand(incomingTemp, incomingTemp, texel.s); + } else { + incomingTemp.reg = locals.obtain(); + expand(incomingTemp, incoming, texel.s); + } + } + + if (incomingTemp.l) { + MOV_REG_TO_REG(incomingTemp.reg, dest.reg); + SHR(incomingTemp.l, dest.reg); + ADD_REG_TO_REG(texel.reg, dest.reg); + } else { + MOV_REG_TO_REG(incomingTemp.reg, dest.reg); + ADD_REG_TO_REG(texel.reg, dest.reg); + } + dest.l = 0; + dest.h = texel.size(); + int temp_reg = locals.obtain(); + component_sat(dest, temp_reg); + locals.recycle(temp_reg); +} + +// ---------------------------------------------------------------------------- + +}; // namespace android diff --git a/libpixelflinger/pixelflinger.cpp b/libpixelflinger/pixelflinger.cpp index fd449b2..f06154f 100644 --- a/libpixelflinger/pixelflinger.cpp +++ b/libpixelflinger/pixelflinger.cpp @@ -32,7 +32,11 @@ #include "scanline.h" #include "trap.h" +#if defined(__i386__) || defined(__x86_64__) +#include "codeflinger/x86/GGLX86Assembler.h" +#else #include "codeflinger/GGLAssembler.h" +#endif #include "codeflinger/CodeCache.h" #include <stdio.h> diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp index 3d14531..5ef932b 100644 --- a/libpixelflinger/scanline.cpp +++ b/libpixelflinger/scanline.cpp @@ -34,7 +34,12 @@ #include "scanline.h" #include "codeflinger/CodeCache.h" +#if defined(__i386__) || defined(__x86_64__) +#include "codeflinger/x86/GGLX86Assembler.h" +#include "codeflinger/x86/X86Assembler.h" +#else #include "codeflinger/GGLAssembler.h" +#endif #if defined(__arm__) #include "codeflinger/ARMAssembler.h" #elif defined(__aarch64__) @@ -61,6 +66,8 @@ #if defined(__arm__) || (defined(__mips__) 
&& !defined(__LP64__) && __mips_isa_rev < 6) || defined(__aarch64__) # define ANDROID_ARM_CODEGEN 1 +#elif defined(__i386__) +# define ANDROID_IA32_CODEGEN 1 #else # define ANDROID_ARM_CODEGEN 0 #endif @@ -284,7 +291,7 @@ static const needs_filter_t fill16noblend = { // ---------------------------------------------------------------------------- -#if ANDROID_ARM_CODEGEN +#if ANDROID_ARM_CODEGEN || ANDROID_IA32_CODEGEN #if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6 static CodeCache gCodeCache(32 * 1024); @@ -316,7 +323,7 @@ void ggl_uninit_scanline(context_t* c) { if (c->state.buffers.coverage) free(c->state.buffers.coverage); -#if ANDROID_ARM_CODEGEN +#if ANDROID_ARM_CODEGEN || ANDROID_IA32_CODEGEN if (c->scanline_as) c->scanline_as->decStrong(c); #endif @@ -436,6 +443,39 @@ static void pick_scanline(context_t* c) c->scanline_as = assembly.get(); c->scanline_as->incStrong(c); // hold on to assembly c->scanline = (void(*)(context_t* c))assembly->base(); +#elif ANDROID_IA32_CODEGEN + const AssemblyKey<needs_t> key(c->state.needs); + sp<Assembly> assembly = gCodeCache.lookup(key); + if (assembly == 0) { + // create a new assembly region + sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs, + ASSEMBLY_SCRATCH_SIZE); + // initialize our assembler + GGLX86Assembler assembler( a ); + // generate the scanline code for the given needs + int err = assembler.scanline(c->state.needs, c); + if (ggl_likely(!err)) { + // finally, cache this assembly + err = gCodeCache.cache(a->key(), a); + } + if (ggl_unlikely(err)) { + ALOGE("error generating or caching assembly. Reverting to NOP. 
cache_err: %d \n", err); + c->scanline = scanline_noop; + c->init_y = init_y_noop; + c->step_y = step_y__nop; + return; + } + assembly = a; + } + + // release the previous assembly + if (c->scanline_as) { + c->scanline_as->decStrong(c); + } + + c->scanline_as = assembly.get(); + c->scanline_as->incStrong(c); // hold on to assembly + c->scanline = (void(*)(context_t* c))assembly->base(); #else // ALOGW("using generic (slow) pixel-pipeline"); c->scanline = scanline; @@ -464,7 +504,7 @@ static void blend_factor(context_t* c, pixel_t* r, uint32_t factor, const pixel_t* src, const pixel_t* dst); static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv); -#if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED) +#if (ANDROID_ARM_CODEGEN || ANDROID_IA32_CODEGEN) && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED) // no need to compile the generic-pipeline, it can't be reached void scanline(context_t*) @@ -939,7 +979,7 @@ discard: } } -#endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED) +#endif // (ANDROID_ARM_CODEGEN || ANDROID_IA32_CODEGEN) && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED) // ---------------------------------------------------------------------------- #if 0 diff --git a/libpixelflinger/tests/codegen/Android.mk b/libpixelflinger/tests/codegen/Android.mk index 2f9ca2f..045a68e 100644 --- a/libpixelflinger/tests/codegen/Android.mk +++ b/libpixelflinger/tests/codegen/Android.mk @@ -1,8 +1,13 @@ LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) +ifeq ($(TARGET_ARCH),x86) LOCAL_SRC_FILES:= \ - codegen.cpp.arm + codegen.cpp +else +LOCAL_SRC_FILES:= \ + codegen.cpp.arm +endif LOCAL_SHARED_LIBRARIES := \ libcutils \ @@ -11,6 +16,10 @@ LOCAL_SHARED_LIBRARIES := \ LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/../.. 
+ifeq ($(TARGET_ARCH),x86) +LOCAL_STATIC_LIBRARIES := libenc +endif + LOCAL_MODULE:= test-opengl-codegen LOCAL_MODULE_TAGS := tests diff --git a/libpixelflinger/tests/codegen/codegen.cpp b/libpixelflinger/tests/codegen/codegen.cpp index 148b6f4..f5c7136 100644 --- a/libpixelflinger/tests/codegen/codegen.cpp +++ b/libpixelflinger/tests/codegen/codegen.cpp @@ -7,15 +7,22 @@ #include "scanline.h" #include "codeflinger/CodeCache.h" +#if defined(__i386__) || defined(__x86_64__) +#include "codeflinger/x86/GGLX86Assembler.h" +#include "codeflinger/x86/X86Assembler.h" +#else #include "codeflinger/GGLAssembler.h" #include "codeflinger/ARMAssembler.h" #if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6 #include "codeflinger/MIPSAssembler.h" #endif #include "codeflinger/Arm64Assembler.h" +#endif #if defined(__arm__) || (defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6) || defined(__aarch64__) # define ANDROID_ARM_CODEGEN 1 +#elif defined(__i386__) +# define ANDROID_IA32_CODEGEN 1 #else # define ANDROID_ARM_CODEGEN 0 #endif @@ -40,7 +47,6 @@ public: static void ggl_test_codegen(uint32_t n, uint32_t p, uint32_t t0, uint32_t t1) { -#if ANDROID_ARM_CODEGEN GGLContext* c; gglInit(&c); needs_t needs; @@ -48,6 +54,7 @@ static void ggl_test_codegen(uint32_t n, uint32_t p, uint32_t t0, uint32_t t1) needs.p = p; needs.t[0] = t0; needs.t[1] = t1; +#if ANDROID_ARM_CODEGEN sp<ScanlineAssembly> a(new ScanlineAssembly(needs, ASSEMBLY_SCRATCH_SIZE)); #if defined(__arm__) @@ -66,10 +73,15 @@ static void ggl_test_codegen(uint32_t n, uint32_t p, uint32_t t0, uint32_t t1) if (err != 0) { printf("error %08x (%s)\n", err, strerror(-err)); } - gglUninit(c); -#else - printf("This test runs only on ARM, Arm64 or MIPS\n"); +#elif ANDROID_IA32_CODEGEN + sp<ScanlineAssembly> a(new ScanlineAssembly(needs, ASSEMBLY_SCRATCH_SIZE)); + GGLX86Assembler assembler( a ); + int err = assembler.scanline(needs, (context_t*)c); + if (err != 0) { + printf("error %08x (%s)\n", err, 
strerror(-err)); + } #endif + gglUninit(c); } int main(int argc, char** argv) diff --git a/logcat/logcat.cpp b/logcat/logcat.cpp index 07d0a5c..47f0136 100644 --- a/logcat/logcat.cpp +++ b/logcat/logcat.cpp @@ -654,7 +654,7 @@ int main(int argc, char **argv) break; case 'f': - if ((tail_time == log_time::EPOCH) && (tail_lines != 0)) { + if ((tail_time == log_time::EPOCH) && (tail_lines == 0)) { tail_time = lastLogTime(optarg); } // redirect output to a file diff --git a/logcat/tests/logcat_test.cpp b/logcat/tests/logcat_test.cpp index de2db67..9455d87 100644 --- a/logcat/tests/logcat_test.cpp +++ b/logcat/tests/logcat_test.cpp @@ -15,10 +15,12 @@ */ #include <ctype.h> +#include <dirent.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/types.h> #include <gtest/gtest.h> #include <log/log.h> @@ -284,7 +286,7 @@ TEST(logcat, get_size) { while (fgets(buffer, sizeof(buffer), fp)) { int size, consumed, max, payload; - char size_mult[2], consumed_mult[2]; + char size_mult[3], consumed_mult[3]; long full_size, full_consumed; size = consumed = max = payload = 0; @@ -489,12 +491,12 @@ TEST(logcat, logrotate) { static const char comm[] = "logcat -b radio -b events -b system -b main" " -d -f %s/log.txt -n 7 -r 1"; char command[sizeof(buf) + sizeof(comm)]; - sprintf(command, comm, buf); + snprintf(command, sizeof(command), comm, buf); int ret; EXPECT_FALSE((ret = system(command))); if (!ret) { - sprintf(command, "ls -s %s 2>/dev/null", buf); + snprintf(command, sizeof(command), "ls -s %s 2>/dev/null", buf); FILE *fp; EXPECT_TRUE(NULL != (fp = popen(command, "r"))); @@ -503,16 +505,12 @@ TEST(logcat, logrotate) { int count = 0; while (fgets(buffer, sizeof(buffer), fp)) { - static const char match_1[] = "4 log.txt"; - static const char match_2[] = "8 log.txt"; - static const char match_3[] = "12 log.txt"; - static const char match_4[] = "16 log.txt"; static const char total[] = "total "; + int num; + char c; - if (!strncmp(buffer, 
match_1, sizeof(match_1) - 1) - || !strncmp(buffer, match_2, sizeof(match_2) - 1) - || !strncmp(buffer, match_3, sizeof(match_3) - 1) - || !strncmp(buffer, match_4, sizeof(match_4) - 1)) { + if ((2 == sscanf(buffer, "%d log.tx%c", &num, &c)) && + (num <= 24)) { ++count; } else if (strncmp(buffer, total, sizeof(total) - 1)) { fprintf(stderr, "WARNING: Parse error: %s", buffer); @@ -522,7 +520,7 @@ TEST(logcat, logrotate) { EXPECT_TRUE(count == 7 || count == 8); } } - sprintf(command, "rm -rf %s", buf); + snprintf(command, sizeof(command), "rm -rf %s", buf); EXPECT_FALSE(system(command)); } @@ -534,12 +532,12 @@ TEST(logcat, logrotate_suffix) { static const char logcat_cmd[] = "logcat -b radio -b events -b system -b main" " -d -f %s/log.txt -n 10 -r 1"; char command[sizeof(tmp_out_dir) + sizeof(logcat_cmd)]; - sprintf(command, logcat_cmd, tmp_out_dir); + snprintf(command, sizeof(command), logcat_cmd, tmp_out_dir); int ret; EXPECT_FALSE((ret = system(command))); if (!ret) { - sprintf(command, "ls %s 2>/dev/null", tmp_out_dir); + snprintf(command, sizeof(command), "ls %s 2>/dev/null", tmp_out_dir); FILE *fp; EXPECT_TRUE(NULL != (fp = popen(command, "r"))); @@ -575,7 +573,113 @@ TEST(logcat, logrotate_suffix) { pclose(fp); EXPECT_EQ(11, log_file_count); } - sprintf(command, "rm -rf %s", tmp_out_dir); + snprintf(command, sizeof(command), "rm -rf %s", tmp_out_dir); + EXPECT_FALSE(system(command)); +} + +TEST(logcat, logrotate_continue) { + static const char tmp_out_dir_form[] = "/data/local/tmp/logcat.logrotate.XXXXXX"; + char tmp_out_dir[sizeof(tmp_out_dir_form)]; + ASSERT_TRUE(NULL != mkdtemp(strcpy(tmp_out_dir, tmp_out_dir_form))); + + static const char log_filename[] = "log.txt"; + static const char logcat_cmd[] = "logcat -b all -d -f %s/%s -n 256 -r 1024"; + static const char cleanup_cmd[] = "rm -rf %s"; + char command[sizeof(tmp_out_dir) + sizeof(logcat_cmd) + sizeof(log_filename)]; + snprintf(command, sizeof(command), logcat_cmd, tmp_out_dir, log_filename); + + int 
ret; + EXPECT_FALSE((ret = system(command))); + if (ret) { + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); + EXPECT_FALSE(system(command)); + return; + } + FILE *fp; + snprintf(command, sizeof(command), "%s/%s", tmp_out_dir, log_filename); + EXPECT_TRUE(NULL != ((fp = fopen(command, "r")))); + if (!fp) { + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); + EXPECT_FALSE(system(command)); + return; + } + char *line = NULL; + char *last_line = NULL; // this line is allowed to stutter, one-line overlap + char *second_last_line = NULL; + size_t len = 0; + while (getline(&line, &len, fp) != -1) { + free(second_last_line); + second_last_line = last_line; + last_line = line; + line = NULL; + } + fclose(fp); + free(line); + if (second_last_line == NULL) { + fprintf(stderr, "No second to last line, using last, test may fail\n"); + second_last_line = last_line; + last_line = NULL; + } + free(last_line); + EXPECT_TRUE(NULL != second_last_line); + if (!second_last_line) { + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); + EXPECT_FALSE(system(command)); + return; + } + // re-run the command, it should only add a few lines more content if it + // continues where it left off. 
+ snprintf(command, sizeof(command), logcat_cmd, tmp_out_dir, log_filename); + EXPECT_FALSE((ret = system(command))); + if (ret) { + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); + EXPECT_FALSE(system(command)); + return; + } + DIR *dir; + EXPECT_TRUE(NULL != (dir = opendir(tmp_out_dir))); + if (!dir) { + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); + EXPECT_FALSE(system(command)); + return; + } + struct dirent *entry; + unsigned count = 0; + while ((entry = readdir(dir))) { + if (strncmp(entry->d_name, log_filename, sizeof(log_filename) - 1)) { + continue; + } + snprintf(command, sizeof(command), "%s/%s", tmp_out_dir, entry->d_name); + EXPECT_TRUE(NULL != ((fp = fopen(command, "r")))); + if (!fp) { + fprintf(stderr, "%s ?\n", command); + continue; + } + line = NULL; + size_t number = 0; + while (getline(&line, &len, fp) != -1) { + ++number; + if (!strcmp(line, second_last_line)) { + EXPECT_TRUE(++count <= 1); + fprintf(stderr, "%s(%zu):\n", entry->d_name, number); + } + } + fclose(fp); + free(line); + unlink(command); + } + closedir(dir); + if (count > 1) { + char *brk = strpbrk(second_last_line, "\r\n"); + if (!brk) { + brk = second_last_line + strlen(second_last_line); + } + fprintf(stderr, "\"%.*s\" occured %u times\n", + (int)(brk - second_last_line), second_last_line, count); + } + free(second_last_line); + + snprintf(command, sizeof(command), cleanup_cmd, tmp_out_dir); EXPECT_FALSE(system(command)); } diff --git a/rootdir/Android.mk b/rootdir/Android.mk index 7ab76b8..836e585 100644 --- a/rootdir/Android.mk +++ b/rootdir/Android.mk @@ -30,16 +30,21 @@ LOCAL_POST_INSTALL_CMD := mkdir -p $(addprefix $(TARGET_ROOT_OUT)/, \ include $(BUILD_SYSTEM)/base_rules.mk -# Regenerate init.environ.rc if PRODUCT_BOOTCLASSPATH has changed. -bcp_md5 := $(word 1, $(shell echo $(PRODUCT_BOOTCLASSPATH) $(PRODUCT_SYSTEM_SERVER_CLASSPATH) | $(MD5SUM))) +# Regenerate init.environ.rc if PRODUCT_BOOTCLASSPATH or TARGET_LDPRELOAD has changed. 
+bcp_md5 := $(word 1, $(shell echo $(PRODUCT_BOOTCLASSPATH) $(PRODUCT_SYSTEM_SERVER_CLASSPATH) $(TARGET_LDPRELOAD) | $(MD5SUM))) bcp_dep := $(intermediates)/$(bcp_md5).bcp.dep $(bcp_dep) : $(hide) mkdir -p $(dir $@) && rm -rf $(dir $@)*.bcp.dep && touch $@ +ifneq ($(strip $(TARGET_LDPRELOAD)),) + TARGET_LDPRELOAD_STR := :$(TARGET_LDPRELOAD) +endif + $(LOCAL_BUILT_MODULE): $(LOCAL_PATH)/init.environ.rc.in $(bcp_dep) @echo "Generate: $< -> $@" @mkdir -p $(dir $@) - $(hide) sed -e 's?%BOOTCLASSPATH%?$(PRODUCT_BOOTCLASSPATH)?g' $< >$@ + $(hide) sed -e 's?%BOOTCLASSPATH%?$(PRODUCT_BOOTCLASSPATH)?g'\ + -e 's?%TARGET_LDPRELOAD%?$(TARGET_LDPRELOAD_STR)?g' $< >$@ $(hide) sed -i -e 's?%SYSTEMSERVERCLASSPATH%?$(PRODUCT_SYSTEM_SERVER_CLASSPATH)?g' $@ bcp_md5 := diff --git a/rootdir/init.environ.rc.in b/rootdir/init.environ.rc.in index b34ea01..46ec1fb 100644 --- a/rootdir/init.environ.rc.in +++ b/rootdir/init.environ.rc.in @@ -9,3 +9,4 @@ on init export ASEC_MOUNTPOINT /mnt/asec export BOOTCLASSPATH %BOOTCLASSPATH% export SYSTEMSERVERCLASSPATH %SYSTEMSERVERCLASSPATH% + export LD_PRELOAD libsigchain.so%TARGET_LDPRELOAD% diff --git a/rootdir/init.rc b/rootdir/init.rc index 5c6b606..78adacc 100644 --- a/rootdir/init.rc +++ b/rootdir/init.rc @@ -21,6 +21,8 @@ on early-init # Set the security context of /adb_keys if present. 
restorecon /adb_keys + mount debugfs /sys/kernel/debug /sys/kernel/debug mode=755 + start ueventd on init @@ -169,13 +171,19 @@ on init chown system system /dev/cpuset/foreground chown system system /dev/cpuset/foreground/boost chown system system /dev/cpuset/background + chown system system /dev/cpuset/system-background chown system system /dev/cpuset/tasks chown system system /dev/cpuset/foreground/tasks chown system system /dev/cpuset/foreground/boost/tasks chown system system /dev/cpuset/background/tasks + chown system system /dev/cpuset/system-background/tasks + + # set system-background to 0775 so SurfaceFlinger can touch it + chmod 0775 /dev/cpuset/system-background chmod 0664 /dev/cpuset/foreground/tasks chmod 0664 /dev/cpuset/foreground/boost/tasks chmod 0664 /dev/cpuset/background/tasks + chmod 0664 /dev/cpuset/system-background/tasks chmod 0664 /dev/cpuset/tasks @@ -653,7 +661,6 @@ service surfaceflinger /system/bin/surfaceflinger user system group graphics drmrpc onrestart restart zygote - writepid /dev/cpuset/system-background/tasks service drm /system/bin/drmserver class main @@ -759,10 +766,10 @@ on property:persist.logd.logpersistd=logcatd # all exec/services are called with umask(077), so no gain beyond 0700 mkdir /data/misc/logd 0700 logd log # logd for write to /data/misc/logd, log group for read from pstore (-L) - exec - logd log -- /system/bin/logcat -L -b all -v threadtime -v usec -v printable -D -f /data/misc/logd/logcat -r 64 -n 256 + exec - logd log -- /system/bin/logcat -L -b all -v threadtime -v usec -v printable -D -f /data/misc/logd/logcat -r 1024 -n 256 start logcatd -service logcatd /system/bin/logcat -b all -v threadtime -v usec -v printable -D -f /data/misc/logd/logcat -r 64 -n 256 +service logcatd /system/bin/logcat -b all -v threadtime -v usec -v printable -D -f /data/misc/logd/logcat -r 1024 -n 256 class late_start disabled # logd for write to /data/misc/logd, log group for read from log daemon diff --git a/rootdir/init.trace.rc 
b/rootdir/init.trace.rc index 50944e6..4933156 100644 --- a/rootdir/init.trace.rc +++ b/rootdir/init.trace.rc @@ -12,6 +12,7 @@ on boot chown root shell /sys/kernel/debug/tracing/options/print-tgid chown root shell /sys/kernel/debug/tracing/events/sched/sched_switch/enable chown root shell /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable + chown root shell /sys/kernel/debug/tracing/events/sched/sched_blocked_reason/enable chown root shell /sys/kernel/debug/tracing/events/power/cpu_frequency/enable chown root shell /sys/kernel/debug/tracing/events/power/cpu_idle/enable chown root shell /sys/kernel/debug/tracing/events/power/clock_set_rate/enable @@ -24,6 +25,7 @@ on boot chmod 0664 /sys/kernel/debug/tracing/options/print-tgid chmod 0664 /sys/kernel/debug/tracing/events/sched/sched_switch/enable chmod 0664 /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable + chmod 0664 /sys/kernel/debug/tracing/events/sched/sched_blocked_reason/enable chmod 0664 /sys/kernel/debug/tracing/events/power/cpu_frequency/enable chmod 0664 /sys/kernel/debug/tracing/events/power/cpu_idle/enable chmod 0664 /sys/kernel/debug/tracing/events/power/clock_set_rate/enable diff --git a/sdcard/sdcard.c b/sdcard/sdcard.c index 13009aa..33b1509 100644 --- a/sdcard/sdcard.c +++ b/sdcard/sdcard.c @@ -507,6 +507,16 @@ static void derive_permissions_locked(struct fuse* fuse, struct node *parent, } } +static void derive_permissions_recursive_locked(struct fuse* fuse, struct node *parent) { + struct node *node; + for (node = parent->child; node; node = node->next) { + derive_permissions_locked(fuse, parent, node); + if (node->child) { + derive_permissions_recursive_locked(fuse, node); + } + } +} + /* Kernel has already enforced everything we returned through * derive_permissions_locked(), so this is used to lock down access * even further, such as enforcing that apps hold sdcard_rw. 
*/ @@ -1145,6 +1155,8 @@ static int handle_rename(struct fuse* fuse, struct fuse_handler* handler, res = rename_node_locked(child_node, new_name, new_actual_name); if (!res) { remove_node_from_parent_locked(child_node); + derive_permissions_locked(fuse, new_parent_node, child_node); + derive_permissions_recursive_locked(fuse, child_node); add_node_to_parent_locked(child_node, new_parent_node); } goto done; @@ -1203,11 +1215,11 @@ static int handle_open(struct fuse* fuse, struct fuse_handler* handler, out.fh = ptr_to_id(h); out.open_flags = 0; - #ifdef FUSE_SHORTCIRCUIT - out.lower_fd = h->fd; - #else - out.padding = 0; - #endif +#if defined(FUSE_STACKED_IO) || defined(FUSE_SHORTCIRCUIT) + out.lower_fd = h->fd; +#else + out.padding = 0; +#endif fuse_reply(fuse, hdr->unique, &out, sizeof(out)); return NO_STATUS; @@ -1373,11 +1385,11 @@ static int handle_opendir(struct fuse* fuse, struct fuse_handler* handler, out.fh = ptr_to_id(h); out.open_flags = 0; - #ifdef FUSE_SHORTCIRCUIT - out.lower_fd = -1; - #else - out.padding = 0; - #endif +#if defined(FUSE_STACKED_IO) || defined(FUSE_SHORTCIRCUIT) + out.lower_fd = -1; +#else + out.padding = 0; +#endif fuse_reply(fuse, hdr->unique, &out, sizeof(out)); return NO_STATUS; @@ -1461,9 +1473,12 @@ static int handle_init(struct fuse* fuse, struct fuse_handler* handler, out.max_readahead = req->max_readahead; out.flags = FUSE_ATOMIC_O_TRUNC | FUSE_BIG_WRITES; - #ifdef FUSE_SHORTCIRCUIT - out.flags |= FUSE_SHORTCIRCUIT; - #endif +#ifdef FUSE_SHORTCIRCUIT + out.flags |= FUSE_SHORTCIRCUIT; +#endif +#ifdef FUSE_STACKED_IO + out.flags |= FUSE_STACKED_IO; +#endif out.max_background = 32; out.congestion_threshold = 32; @@ -1680,6 +1695,10 @@ static int read_package_list(struct fuse_global* global) { TRACE("read_package_list: found %zu packages\n", hashmapSize(global->package_to_appid)); fclose(file); + + /* Regenerate ownership details using newly loaded mapping */ + derive_permissions_recursive_locked(global->fuse_default, 
&global->root); + pthread_mutex_unlock(&global->lock); return 0; } |