Closed b6a8896e-9cf9-4b57-bc69-ae8e91e85ef7 closed 10 years ago
Hi,
Plist files have actually three flavors : XML ones, binary ones, and now (starting from Mac OS X 10.7 Lion) json one. The plistlib.readPlist function can only read XML plist files and thus cannot read binary and json ones.
The binary format is open and described by Apple (http://opensource.apple.com/source/CF/CF-550/CFBinaryPList.c).
Here is the diff (from Python 2.7 implementation of plistlib) to transparently read both binary and json formats.
API of plistlib remains unchanged, since format detection is done by plistlib.readPlist. An InvalidFileException is raised in case of malformed binary file.
57,58c57 \< "Plist", "Data", "Dict", \< "InvalidFileException", ---
"Plist", "Data", "Dict"
64d62 \< import json 66d63 \< import os 68d64 \< import struct 81,89c77,78 \< header = pathOrFile.read(8) \< pathOrFile.seek(0) \< if header == '\<?xml ve' or header[2:] == '\<?xml ': #XML plist file, without or with BOM \< p = PlistParser() \< rootObject = p.parse(pathOrFile) \< elif header == 'bplist00': #binary plist file \< rootObject = readBinaryPlistFile(pathOrFile) \< else: #json plist file \< rootObject = json.load(pathOrFile) --- p = PlistParser() rootObject = p.parse(pathOrFile) 195,285d183 \< \< # timestamp 0 of binary plists corresponds to 1/1/2001 (year of Mac OS X 10.0), instead of 1/1/1970. \< MAC_OS_X_TIMEOFFSET = (31 365 + 8) 86400 \< \< class InvalidFileException(ValueError): \< def \_str(self): \< return "Invalid file" \< def __unicode(self): \< return "Invalid file" \< \< def readBinaryPlistFile(in_file): \< """ \< Read a binary plist file, following the description of the binary format: http://opensource.apple.com/source/CF/CF-550/CFBinaryPList.c \< Raise InvalidFileException in case of error, otherwise return the root object, as usual \< """ \< in_file.seek(-32, os.SEEK_END) \< trailer = in_file.read(32) \< if len(trailer) != 32: \< return InvalidFileException() \< offset_size, ref_size, num_objects, top_object, offset_table_offset = struct.unpack('>6xBB4xL4xL4xL', trailer) \< in_file.seek(offset_table_offset) \< object_offsets = [] \< offset_format = '>' + {1: 'B', 2: 'H', 4: 'L', 8: 'Q', }[offset_size] num_objects \< ref_format = {1: 'B', 2: 'H', 4: 'L', 8: 'Q', }[ref_size] \< int_format = {0: (1, '>B'), 1: (2, '>H'), 2: (4, '>L'), 3: (8, '>Q'), } \< object_offsets = struct.unpack(offset_format, in_file.read(offset_size num_objects)) \< def getSize(token_l): \< """ return the size of the next object.""" \< if token_l == 0xF: \< m = ord(in_file.read(1)) & 0x3 \< s, f = int_format[m] \< return struct.unpack(f, in_file.read(s))[0] \< return token_l \< def readNextObject(offset): \< """ read the object at offset. May recursively read sub-objects (content of an array/dict/set) """ \< in_file.seek(offset) \< token = in_file.read(1) \< token_h, token_l = ord(token) & 0xF0, ord(token) & 0x0F #high and low parts \< if token == '\x00': \< return None \< elif token == '\x08': \< return False \< elif token == '\x09': \< return True \< elif token == '\x0f': \< return '' \< elif token_h == 0x10: #int \< result = 0 \< for k in xrange((2 \<\< token_l) - 1): \< result = (result \<\< 8) + ord(in_file.read(1)) \< return result \< elif token_h == 0x20: #real \< if token_l == 2: \< return struct.unpack('>f', in_file.read(4))[0] \< elif token_l == 3: \< return struct.unpack('>d', in_file.read(8))[0] \< elif token_h == 0x30: #date \< f = struct.unpack('>d', in_file.read(8))[0] \< return datetime.datetime.utcfromtimestamp(f + MAC_OS_X_TIME_OFFSET) \< elif token_h == 0x80: #data \< s = getSize(token_l) \< return in_file.read(s) \< elif token_h == 0x50: #ascii string \< s = getSize(token_l) \< return in_file.read(s) \< elif token_h == 0x60: #unicode string \< s = getSize(token_l) \< return in_file.read(s 2).decode('utf-16be') \< elif token_h == 0x80: #uid \< return in_file.read(token_l + 1) \< elif token_h == 0xA0: #array \< s = getSize(token_l) \< obj_refs = struct.unpack('>' + ref_format s, in_file.read(s ref_size)) \< return map(lambda x: readNextObject(object_offsets[x]), obj_refs) \< elif token_h == 0xC0: #set \< s = getSize(token_l) \< obj_refs = struct.unpack('>' + ref_format s, in_file.read(s ref_size)) \< return set(map(lambda x: readNextObject(object_offsets[x]), obj_refs)) \< elif token_h == 0xD0: #dict \< result = {} \< s = getSize(token_l) \< key_refs = struct.unpack('>' + ref_format s, in_file.read(s ref_size)) \< obj_refs = struct.unpack('>' + ref_format s, in_file.read(s * ref_size)) \< for k, o in zip(key_refs, obj_refs): \< key = readNextObject(object_offsets[k]) \< obj = readNextObject(object_offsets[o]) \< result[key] = obj \< return result \< raise InvalidFileException() \< return readNextObject(object_offsets[top_object]) \<
Thanks for the patch. Could you upload it as a context diff?
Here is the new patch. I assumed that you meant to use diff -c instead of the raw diff command.
Hmm. Apparently what I meant was -u instead of -c (unified diff). I just use the 'hg diff' command myself, which does the right thing :) Of course, to do that you need to have a checkout. (We can probably use the context diff.)
This patch is for Python 2. New features are accepted only for Python 3.3+. I ported the patch, but since I have no Mac, I can't check.
To date code was specified incorrectly.
The length of integers was calculated incorrectly. To convert integers, you can use int.from_bytes.
Objects identity was not preserved.
I'm not sure that the recognition of XML done enough. Should consider UTF-16 and UTF-32 with the BOM and without.
Need tests.
Also I'm a bit cleaned up and modernizing the code. I believe that it should be rewritten in a more object-oriented style. It is also worth to implement writer.
storchaka > I'm trying to take care of your remarks. So, I'm working on a more object-oriented code, with both write and read functions. I just need to write some test cases. IMHO, we should add a new parameter to the writePlist function, to allow the use of the binary or the json format of plist files instead of the default XML one.
Keep it simple: if a few functions work, there is no need at all to add classes. Before doing more work though I suggest you wait for the feedback of the Mac maintainers.
I (as one of the Mac maintainers) like the new functionality, but would like to see some changes:
1) as others have noted it is odd that binary and json plists can be read but not written
2) there need to be tests, and I'd add two or even three set of tests:
a. tests that read pre-generated files in the various formats (tests that we're compatible with the format generated by Apple)
b. tests that use Apple tools to generated plists in various formats, and check that the library can read them (these tests would be skipped on platforms other than OSX)
c. if there are read and write functions: check that the writer generates files that can be read back in.
3) there is a new public function for reading binary plist files, I'd keep that private and add a "format" argument to readPlist when there is a need for forcing the usage of a specific format (and to mirror the (currently hypothetical) format argument for writePlist).
Don't worry about rearchitecturing plistlib, it might need work in that regard but that need not be part of this issue and makes it harder to review the changes. I'm also far from convinced that a redesign of the code is needed.
I'm working on a class, BinaryPlistParser, which allow to both read and write binary files.
I've also added a parameter fmt to writePlist and readPlist, to specify the format ('json', 'xml1' or 'binary1', using XML by default). These constants are used by Apple for its plutil program.
I'm now working on integrating these three formats to the test_plistlib.py. However, the json is less expressive than the other two, since it cannot handle dates.
Here is the new patch, allowing read and write binary, json and xml plist files.
It includes both the plistlib.py and test/test_plistlib.py patches. JSON format does not allow dates and data, so XML is used by default to write files. I use the json library to write JSON plist files, but its output is slightly different from the Apple default output: keys of dictionaries are in different order. Thus, I removed the test_appleformattingfromliteral test for JSON files.
Similarly, my binary writer does not write the same binary files as the Apple library: my library writes the content of compound objects (dicts, lists and sets) before the object itself, while Apple writes the object before its content. Copying the Apple behavior results in some additional weird lines of code, for little benefit. Thus, I also removed the test_appleformattingfromliteral test for binary files.
Other tests are made for all the three formats.
Hi,
I noticed in the latest message that d9pounces posted that "JSON format does not allow dates and data, so XML is used by default to write files.". Rthe XML version of plists also do not really 'support' those types, and they are converted as follows:
NSData -> Base64 encoded data NSDate -> ISO 8601 formatted string
(from http://en.wikipedia.org/wiki/Property_list#Mac_OS_X)
So really it should be the same thing when converting to json no?
The plutil (Apple's command-line tool to convert plist files from a format to another) returns an error if you try to convert a XML plist with dates to JSON.
Where are you even seeing these json property lists? I just checked the most recent documentation for NSPropertyListSerialization, and they have not updated the enum for NSPropertyListFormat. It seems that if even Apple doesn't support writing json property lists with their own apis then we shouldn't worry about supporting it?
enum { NSPropertyListOpenStepFormat = kCFPropertyListOpenStepFormat, NSPropertyListXMLFormat_v1_0 = kCFPropertyListXMLFormat_v1_0, NSPropertyListBinaryFormat_v1_0 = kCFPropertyListBinaryFormat_v1_0 }; NSPropertyListFormat; typedef NSUInteger NSPropertyListFormat;
plutil(1) supports writing json format.
That written, the opensource parts of CoreFoundation on opensource.apple.com don't support reading or writing json files.
I'm therefore -1 w.r.t. adding support for json formatted plist files, support for json can be added when Apple actually supports that it the system libraries and hence the format is stable.
are any more changes needed to the code that is already posted as a patch in this bug report? or are the changes you wanted to see happen in msg157669 not happen yet?
d9pouces: are you willing to sign a contributor agreement? The agreement is needed before we can add these changes to the stdlib, and I'd like to that for the 3.4 release.
More information on the contributor agreement: http://www.python.org/psf/contrib/contrib-form/
I just signed this agreement. Thanks for accepting this patch!
I've started work on integrating the latest patch.
Some notes (primarily for my own use):
I'll drop support for the JSON format, that format is not used by Apple's libraries (although the plutil tool can convert plists to JSON)
The patch (plistlib_with_tests.diff) no longer applies cleanly, there are two rejected hunks in test_plist.py.
There is no documentation update in the patch, I'm working on that
There is also an NextStep format (see \http://code.google.com/p/networkpx/wiki/PlistSpec\. That format is not supported by plutil, but is supported by Apple's libraries. Support for this can be added later.
plistlib needs futher work as well, it would be nice to move closer to the PEP-8 coding style (although this should be done carefully to maintain compatibility with existing code.
plistlib.Data should be deprecated. The class was needed in python 2.7, but python 3 already has a unambiguous representation for binary data: the bytes type.
It might be nice to add some options from the json library to the serialization function:
Issue bpo-11101 mentions that it would be nice to have an option to ignore keys whose value is None. I'm not sure if this is really useful, and if it is it might be better to add a "skip values" option that ignores items where the value cannot be encoded.
I've attached bpo-14455-v2.txt with an updated patch. The patch is still very much a work in progress, I haven't had as much time to work on this as I'd like.
This version:
Should apply cleanly to the tip of the default branch
Move around some code.
Doesn't pass unit tests (most likely because I've botched the manual merge). The unittests also don't cover the functionality I've added.
Adds documentation
Adds 'skipkeys' and 'sort_keys' to the write functions (with the same semantics as these keywords have with json.dump)
Adds 'data_as_bytes' to the read functions. Then this option is true binary data is returned as an instance of bytes instead of plistlib.Data The latter is still the default.
v3 is still a work in progress, and still fails some tests
Replaced test data by data generated by a helper script (to make it easier to update)
Use 'subtest' feature of unittest library in 3.4
Small tweaks to plist library (the dump/load/dumps/loads function at the end probably won't survive)
Updated link to the CFBinaryPlist.c source code (this should be a newer version of the file)
Added option to readPlist to pass in the dictionary type (defaults to plistlib._InternalDict). This is primarily useful for testing and debugging, but also mirrors the 'sortkeys' keyword argument for the writer (when the order can be important for writing the user should have some way to detect the order when reading).
The data generated by the binary plist generator does't match the data generated by Cocoa in OSX 10.8 (and generated by the helper script), I haven't fully debugged that problem yet. The generated binary plist and the Cocoa version can both be parsed by plistlib, and result in the same data structure
See also:
bpo-18168: request for the sort_keys option bpo-11101: request for an option to ignore 'None' values when writing bpo-9256: datetime.datetime objects created by plistlib don't include timezone information (and looking at the code I'd say that timezones are ignored when *writing* plist files as well) bpo-10733: Apple's plist can create malformed XML (control characters) than cannot be read by plistlib
The test failure I'm getting is caused by a difference in the order in which items are written to the archive. I'm working on a fix.
v4 passes the included tests.
The testsuite isn't finished yet.
The fifth version of the patch should be much cleaner.
Changes:
Coding style cleanup, the new code uses PEP-8 conformant names for methods and variables.
Explicitly make private classes private by prefixing their name with an underscore (including the old XML parser/generator classes)
Remove support in the binary plist code for sets and uuids, neither can be written to plist files by Apple's code.
More tests
There is no support for JSON because JSON is not supported by Apple's propertylist APIs either. The command-line tool plutil does support JSON output, but the C and Objective-C APIs do not.
There is no support for the old OpenStep format. That format is badly documented, has been deprecated for a long time, and writing it is not supported by Apple's plist libraries. The OpenStep format also seems to be much more limited than the two modern ones.
Open issues:
The patch contains plistlib.{dump,dumps,load,loads} which mirror the functions with same name from the pickle and json modules.
Is is usefull to add those functions to plistlib, and deprecate the older functions?
Advantages: + Cleaner API + Provides a clean path to remove plistlib.Data and plistlib._InternalDict (the latter is already deprecated)
Disadvantages:
Is renaming PlistParser and PlistWriter ok? Both are private and a quick search on google seems to indicate that nobody directory uses these classes.
If renaming is ok, should methods/variables be renamed to PEP-8 style?
Should a 'default' keyword be added to the serialization functions (simular to the keyword in json.dump)
I don't have a usecase for this, the only reason to add is is consistency with the json module
Should a 'skipvalues' keyword be added to the serialization functions (to ignore values that cannot be serialized, see bpo-11101)
I'm not convinced that this would be a good idea.
Should a 'check_circular' keyword be added to the serialization functions (again similar to the same keyword for json.dump)?
This would avoid relying on the recursion limit to break infinite loops when serializing circular datastructures.
Would need to check if binary plist can contain circular data structures when they are written using Apple's libraries.
I intend to commit my latest version of the patch during the europython sprints, with a minor change: don't include dump(s) and load(s), that change (and the other items on "open issues" in my last post) can be addressed later.
Let me review your patch.
Any review would be greatly appreciated. One thing I'm not too happy about is the use of magic numbers in the binary plist support code, but I think that using constants or a dispatch table would not make the code any clearer.
I have added comments on Rietveld.
I have to apologize for unwitting misleading of d9pouces. Functional version of the patch is enough Pythonic and it looks more clear to me than object-oriented one.
I've attached a slightly updated version of the patch. The documentation now lists dump and load as the primary API, with the old API as a deprecated alternative.
The code hasn't been changed to relect this yet, but does contain a number of tweaks and bugfixes (and a new testcase that ensures that decoding UTF-16 and UTF-32 files actually works, after testing that those encodings are supported by Apple's tools).
NOTE: no review of this version is needed, I'm mostly posting as backup.
This version should be better:
There should be no lines longer than 80 characters
Changed coding style of the (private) XML plist writer classes to PEP-8
Public API is now dump/dumps and load/loads, the old API is still available and deprecated
-> As mentioned before I'm not entirely sure if doing this is the right solution, but it does give us a clean path to remove deprecated functionality later on (in particular the Dict and Data classes). -> What I haven't done yet, and probably should, is to write a 2to3 fixer that converts code to the new API.
Grouped source in more logical segments (deprecated, xml support, binary support, generic code).
The previous item means that I've moved even more code around, I looked into minimizing the patch but the original module could not be easily extended without moving code around.
Added some more tests -> I might add more tests when I manage to run coverage.py, for some reason the instructions in the dev-guide don't work for me with a framework install :-(
I might add a documentation comment to the binary plist support code that gives an overview of the file format with pointers to more information, but other than that and possible test coverage improvements the patch should be done.
Updated test-data generator: it now encodes the data using base64, to make it easier to generate a file with limited line lengths.
v8 of the patch contains 1 change from v7: the test data is encoded in base64. This was primarily done to ensure that the file has usable line lengths. A nice side effect is that it is now harder than ever to manually change the test data, as the comment mentions there is a script for generating that data.
As always I'd appreciate feedback on this patch, especially on deprecating the current public API and introducing a new (PEP-8 compliant) one.
Ronald, I think v8 of the patch is missing (and plistlib_generate_testdata.py was uploaded twice).
Actually attach the latest version of the patch.
I'd really like to include this patch in 3.4, but haven't managed to do any opensource work in the previous period and don't know when I'll be able to actually commit this (and more importantly, be available when issues crop up) :-(
I added a lot of comments on Rietveld.
I for the most part agree with the comments and will provide an updated patch on thursday. Would you mind if I committed that without further review (due to cutting it awfully close to the deadline for beta 1)?
Some comments I want to reply to specifically:
"Can the code be simpler, with only one pass?"
Maybe, but not right now.
"This is inconsistent with _flatten()."
I'll add a comment that explains why this is: _flatten (and this code) can deal with arbitrary keys, but that is not supported by Apple's code.
The type check in _write_object ensures that it is not possible to write archives that cannot be read back by Apple's Cocoa frameworks.
"unusual indentation" (several times)
I'll have to look at other stdlib code to find suitable indentation, this is indentation I've used in my code for a long time (I've also used camelCase instead of pep8_style names for methods for a long time, which is probably why I never noticed that I forgot to convert some method name when cleaning up the naming conventions used in this module).
It's too large and complicated patch. I would like to have a chance to quick review it before committing. You will have time to commit.
I've attached an updated version of the patch that should fix most of the issues found during review.
I've also changed the two FMT_ constants to an enum.Enum (but still expose the constants themselves as module global names because that's IMHO more convenient).
FYI I'm completely away from the computer during the weekend and will have very limited time to work from later today (18:00 CET).
I have added a few comments on Rietveld. Besides formatting nitpicks your have forgot third argument in new warns and missed some details in tests. As for the rest the patch LGTM. If you have no time I will fixed this minor issues and will commited the patch.
I'm not sure about docstrings text ("return" vs "returns", I don't remember what is better), but we can bikeshed it after beta 1.
Updated patch after next round of reviews.
New changeset 673ca119dbd0 by Ronald Oussoren in branch 'default': Issue bpo-14455: plistlib now supports binary plists and has an updated API. http://hg.python.org/cpython/rev/673ca119dbd0
New changeset 602e0a0ec67e by Ned Deily in branch 'default': Issue bpo-14455: Fix maybe_open typo in Plist.fromFile(). http://hg.python.org/cpython/rev/602e0a0ec67e
These changes are worth to mention in What's News.
"versionchanged" below writePlistToBytes() is wrong. Perhaps below dump() too. "versionadded" is needed for new functions: dump(), dumps(), load(), loads().
Currently negative integers are not supported in binary format. Here is a patch which adds support for negative integers and large integers up to 128 bit.
oops... thanks for the patch. I'll review later this week, in particular the 128 bit integer support because I don't know if Apple's libraries support those.
According to [1] Apple's libraries write any signed 128-bit integers, but read only integers from -2**63 to 2**64-1 (e.g. signed and unsigned 64-bit integers).
[1] http://opensource.apple.com/source/CF/CF-550/CFBinaryPList.c
Yet one nitpick. Perhaps _write_object() should raise TypeError instead of InvalidFileException.
I'm working on an update for your patch that addresses these comments:
I don't like supporting 128 bit integers because Apple's public APIs don't support those values. That is, the value 'kCFNumberSInt128Type' is not in a public header for the OSX 10.9 SDK.
The test_int method that you introduced tests conversions to and from XML, and doesn't test the problem with negative values in binary plists.
I don't understand why test_int converts a value to a binary plist twice, that is the 'data2 = plistlib.dumps(pl2)' bit.
I'm adding negative integers to the _create method as well, with the corresponding changes to the binary TESTDATA dictionary.
I don't understand your comment about the writePlistToBytes documentation, there was no versionchanged in the documentation. The version changed for dump was wrong, that should be versionadded (and the other new functions should have a versionadded as well)
I agree that this change should be mentioned in What's New.
I agree that _write_object should raise TypeError
BTW. What about out-of-range integer values? Those currently raise struct.error, I'd prefer to raise TypeError instead because the use of the struct module should be an implementation detail.
And a final question: integers with '2 63 \<= value \< 2 64' (e.g. values that are in the range of uint64_t but not in the range of int64_t) can be written to a binary plist, but will be read back as a negative value (which is the same behavior as in Apple's code). Should we warn about this in the documentation?
I'll post an updated patch later today.
Note: these values reflect the state of the issue at the time it was migrated and might not reflect the current state.
Show more details
GitHub fields: ```python assignee = 'https://github.com/ronaldoussoren' closed_at =
created_at =
labels = ['OS-mac', 'type-feature', 'library']
title = 'plistlib unable to read json and binary plist files'
updated_at =
user = 'https://bugs.python.org/d9pouces'
```
bugs.python.org fields:
```python
activity =
actor = 'python-dev'
assignee = 'ronaldoussoren'
closed = True
closed_date =
closer = 'ronaldoussoren'
components = ['Library (Lib)', 'macOS']
creation =
creator = 'd9pouces'
dependencies = []
files = ['25075', '25076', '25077', '25156', '30523', '30525', '30530', '30541', '30740', '30749', '30790', '30791', '30874', '32752', '32754', '32937', '33204', '33206', '33207', '33213', '33481']
hgrepos = []
issue_num = 14455
keywords = ['patch', 'needs review']
message_count = 67.0
messages = ['157152', '157154', '157155', '157159', '157166', '157506', '157668', '157669', '157687', '157781', '164620', '165438', '168974', '169000', '169100', '185734', '185736', '189917', '190895', '190902', '190903', '190913', '190922', '190950', '191988', '192000', '192037', '192045', '192129', '192183', '192385', '192386', '192689', '192721', '198952', '203211', '203418', '203419', '203612', '203628', '203629', '203630', '203633', '203723', '205012', '205018', '205026', '205033', '205035', '206592', '206594', '206595', '206600', '206601', '206615', '206619', '208153', '208158', '208159', '208161', '208162', '208163', '208164', '208165', '210372', '210373', '212970']
nosy_count = 9.0
nosy_names = ['ronaldoussoren', 'ned.deily', 'eric.araujo', 'r.david.murray', 'jrjsmrtn', 'python-dev', 'serhiy.storchaka', 'd9pouces', 'markgrandi']
pr_nums = []
priority = 'normal'
resolution = 'fixed'
stage = 'resolved'
status = 'closed'
superseder = None
type = 'enhancement'
url = 'https://bugs.python.org/issue14455'
versions = ['Python 3.4']
```