Updated *.pak file format to support both UTF8 and UTF16

Inserted a new field in the header that specifies which encoding is to be used for the text resources. I also bumped the file format to version 4. BUG=76281 TEST=unit_tests Review URL: http://codereview.chromium.org/7744017 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@100973 0039d316-1c4b-4281-b951-d872f2087c98
author: adriansc@chromium.org <adriansc@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-13 21:45:17 +0000
committer: adriansc@chromium.org <adriansc@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-13 21:45:17 +0000
commit: c13b1e74736c5c65500abf4dfff5ad19ec4383e4 (patch)
tree: 52e8dcb292f0129a74a7951877f036aeca067a00 /tools/grit
parent: a861db15107dec307b9246678101f26238055a3c (diff)
download: chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.zip
chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.tar.gz
chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.tar.bz2
4 files changed, 65 insertions, 35 deletions
diff --git a/tools/grit/grit/format/data_pack.py b/tools/grit/grit/format/data_pack.py
index 01c0c9e..87db064 100755
--- a/tools/grit/grit/format/data_pack.py
+++ b/tools/grit/grit/format/data_pack.py
@@ -19,12 +19,19 @@ from grit.node import message
 from grit.node import misc
 
 
-FILE_FORMAT_VERSION = 3
-HEADER_LENGTH = 2 * 4  # Two uint32s. (file version and number of entries)
+PACK_FILE_VERSION = 4
+HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries) and
+                           # one uint8 (encoding of text resources)
+BINARY, UTF8, UTF16 = range(3)
 
 class WrongFileVersion(Exception):
   pass
 
+class DataPackContents:
+  def __init__(self, resources, encoding):
+    self.resources = resources
+    self.encoding = encoding
+
 class DataPack(interface.ItemFormatter):
   '''Writes out the data pack file format (platform agnostic resource file).'''
   def Format(self, item, lang='en', begin_item=True, output_dir='.'):
@@ -36,9 +43,9 @@ class DataPack(interface.ItemFormatter):
     nodes = DataPack.GetDataNodes(item)
     data = {}
     for node in nodes:
-      id, value = node.GetDataPackPair(lang)
+      id, value = node.GetDataPackPair(lang, UTF8)
       data[id] = value
-    return DataPack.WriteDataPackToString(data)
+    return DataPack.WriteDataPackToString(data, UTF8)
 
   @staticmethod
   def GetDataNodes(item):
@@ -63,13 +70,15 @@ class DataPack(interface.ItemFormatter):
     original_data = data
 
     # Read the header.
-    version, num_entries = struct.unpack("<II", data[:HEADER_LENGTH])
-    if version != FILE_FORMAT_VERSION:
+    version, num_entries, encoding = struct.unpack("<IIB",
+                                                   data[:HEADER_LENGTH])
+    if version != PACK_FILE_VERSION:
+      print "Wrong file version in ", input_file
       raise WrongFileVersion
 
     resources = {}
     if num_entries == 0:
-      return resources
+      return DataPackContents(resources, encoding)
 
     # Read the index and data.
     data = data[HEADER_LENGTH:]
@@ -80,18 +89,18 @@ class DataPack(interface.ItemFormatter):
       next_id, next_offset = struct.unpack("<HI", data[:kIndexEntrySize])
       resources[id] = original_data[offset:next_offset]
 
-    return resources
+    return DataPackContents(resources, encoding)
 
   @staticmethod
-  def WriteDataPackToString(resources):
+  def WriteDataPackToString(resources, encoding):
     """Write a map of id=>data into a string in the data pack format and return
     it."""
     ids = sorted(resources.keys())
     ret = []
 
     # Write file header.
-    ret.append(struct.pack("<II", FILE_FORMAT_VERSION, len(ids)))
-    HEADER_LENGTH = 2 * 4             # Two uint32s.
+    ret.append(struct.pack("<IIB", PACK_FILE_VERSION, len(ids), encoding))
+    HEADER_LENGTH = 2 * 4 + 1            # Two uint32s and one uint8.
 
     # Each entry is a uint16 + a uint32s. We have one extra entry for the last
     # item.
@@ -111,10 +120,10 @@ class DataPack(interface.ItemFormatter):
     return ''.join(ret)
 
   @staticmethod
-  def WriteDataPack(resources, output_file):
+  def WriteDataPack(resources, output_file, encoding):
     """Write a map of id=>data into output_file as a data pack."""
     file = open(output_file, "wb")
-    content = DataPack.WriteDataPackToString(resources)
+    content = DataPack.WriteDataPackToString(resources, encoding)
     file.write(content)
 
   @staticmethod
@@ -122,25 +131,37 @@ class DataPack(interface.ItemFormatter):
     """Write a new data pack to |output_file| based on a list of filenames
     (|input_files|)"""
     resources = {}
+    encoding = None
     for filename in input_files:
-      new_resources = DataPack.ReadDataPack(filename)
+      new_content = DataPack.ReadDataPack(filename)
 
-      # Make sure we have no duplicates.
-      duplicate_keys = set(new_resources.keys()) & set(resources.keys())
+      # Make sure we have no dups.
+      duplicate_keys = set(new_content.resources.keys()) & set(resources.keys())
       if len(duplicate_keys) != 0:
         raise exceptions.KeyError("Duplicate keys: " +
                                   str(list(duplicate_keys)))
 
-      resources.update(new_resources)
+      # Make sure encoding is consistent.
+      if encoding in (None, BINARY):
+        encoding = new_content.encoding
+      elif new_content.encoding not in (BINARY, encoding):
+          raise exceptions.KeyError("Inconsistent encodings: " +
+                                    str(encoding) + " vs " +
+                                    str(new_content.encoding))
+
+      resources.update(new_content.resources)
 
-    DataPack.WriteDataPack(resources, output_file)
+    # Encoding is 0 for BINARY, 1 for UTF8 and 2 for UTF16
+    if encoding is None:
+      encoding = BINARY
+    DataPack.WriteDataPack(resources, output_file, encoding)
 
 def main():
   # Just write a simple file.
   data = { 1: "", 4: "this is id 4", 6: "this is id 6", 10: "" }
-  WriteDataPack(data, "datapack1.pak")
+  DataPack.WriteDataPack(data, "datapack1.pak", UTF8)
   data2 = { 1000: "test", 5: "five" }
-  WriteDataPack(data2, "datapack2.pak")
+  DataPack.WriteDataPack(data2, "datapack2.pak", UTF8)
   print "wrote datapack1 and datapack2 to current directory."
 
 if __name__ == '__main__':
diff --git a/tools/grit/grit/format/data_pack_unittest.py b/tools/grit/grit/format/data_pack_unittest.py
index 35966639..8de54ef 100644
--- a/tools/grit/grit/format/data_pack_unittest.py
+++ b/tools/grit/grit/format/data_pack_unittest.py
@@ -16,15 +16,17 @@ from grit.format import data_pack
 class FormatDataPackUnittest(unittest.TestCase):
   def testWriteDataPack(self):
     expected = (
-        '\x03\x00\x00\x00\x04\x00\x00\x00'  # header (version, no. entries)
-        '\x01\x00\x26\x00\x00\x00'          # index entry 1
-        '\x04\x00\x26\x00\x00\x00'          # index entry 4
-        '\x06\x00\x32\x00\x00\x00'          # index entry 6
-        '\x0a\x00\x3e\x00\x00\x00'          # index entry 10
-        '\x00\x00\x3e\x00\x00\x00'          # extra entry for the size of last
+        '\x04\x00\x00\x00'                  # header(version
+        '\x04\x00\x00\x00'                  #        no. entries,
+        '\x01'                              #        encoding)
+        '\x01\x00\x27\x00\x00\x00'          # index entry 1
+        '\x04\x00\x27\x00\x00\x00'          # index entry 4
+        '\x06\x00\x33\x00\x00\x00'          # index entry 6
+        '\x0a\x00\x3f\x00\x00\x00'          # index entry 10
+        '\x00\x00\x3f\x00\x00\x00'          # extra entry for the size of last
         'this is id 4this is id 6')         # data
     input = { 1: "", 4: "this is id 4", 6: "this is id 6", 10: "" }
-    output = data_pack.DataPack.WriteDataPackToString(input)
+    output = data_pack.DataPack.WriteDataPackToString(input, data_pack.UTF8)
     self.failUnless(output == expected)
 
 
diff --git a/tools/grit/grit/node/include.py b/tools/grit/grit/node/include.py
index b073bc7..dc29315 100644
--- a/tools/grit/grit/node/include.py
+++ b/tools/grit/grit/node/include.py
@@ -74,7 +74,7 @@ class IncludeNode(base.Node):
     '''
     return self.FilenameToOpen()
 
-  def GetDataPackPair(self, lang):
+  def GetDataPackPair(self, lang, encoding):
     '''Returns a (id, string) pair that represents the resource id and raw
     bytes of the data.  This is used to generate the data pack data file.
     '''
@@ -90,6 +90,8 @@ class IncludeNode(base.Node):
       data = infile.read()
       infile.close()
 
+    # Include does not care about the encoding, because it only returns binary
+    # data.
     return id, data
 
   def Flatten(self, output_dir):
@@ -136,4 +138,3 @@ class IncludeNode(base.Node):
     node.EndParsing()
     return node
   Construct = staticmethod(Construct)
-
diff --git a/tools/grit/grit/node/message.py b/tools/grit/grit/node/message.py
index a48a645..3c5ac64 100644
--- a/tools/grit/grit/node/message.py
+++ b/tools/grit/grit/node/message.py
@@ -1,5 +1,5 @@
 #!/usr/bin/python2.4
-# Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+# Copyright (c) 2011 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
@@ -19,6 +19,7 @@ from grit import exception
 from grit import tclib
 from grit import util
 
+BINARY, UTF8, UTF16 = range(3)
 
 # Finds whitespace at the start and end of a string which can be multiline.
 _WHITESPACE = re.compile('(?P<start>\s*)(?P<body>.+?)(?P<end>\s*)\Z',
@@ -186,7 +187,7 @@ class MessageNode(base.ContentNode):
     else:
       return self.attrs['offset']
 
-  def GetDataPackPair(self, lang):
+  def GetDataPackPair(self, lang, encoding):
     '''Returns a (id, string) pair that represents the string id and the string
     in utf8.  This is used to generate the data pack data file.
     '''
@@ -199,10 +200,15 @@ class MessageNode(base.ContentNode):
       # Windows automatically translates \n to a new line, but GTK+ doesn't.
       # Manually do the conversion here rather than at run time.
       message = message.replace("\\n", "\n")
-    # |message| is a python unicode string, so convert to a utf16 byte stream
-    # because that's the format of datapacks.  We skip the first 2 bytes
-    # because it is the BOM.
-    return id, message.encode('utf16')[2:]
+    # |message| is a python unicode string, so convert to a byte stream that
+    # has the correct encoding requested for the datapacks. For UTF16 only, we
+    # skip the first 2 bytes of the encoded text because they are the BOM.
+    if encoding == UTF8:
+      return id, message.encode('utf8')
+    if encoding == UTF16:
+      return id, message.encode('utf16')[2:]
+    # Default is BINARY
+    return id, message
 
   # static method
   def Construct(parent, message, name, desc='', meaning='', translateable=True):
author	adriansc@chromium.org <adriansc@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-13 21:45:17 +0000
committer	adriansc@chromium.org <adriansc@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-13 21:45:17 +0000
commit	c13b1e74736c5c65500abf4dfff5ad19ec4383e4 (patch)
tree	52e8dcb292f0129a74a7951877f036aeca067a00 /tools/grit
parent	a861db15107dec307b9246678101f26238055a3c (diff)
download	chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.zip chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.tar.gz chromium_src-c13b1e74736c5c65500abf4dfff5ad19ec4383e4.tar.bz2