diff --git a/CHANGELOG.md b/CHANGELOG.md index 46914a4d1ca72c021af82a4001914664aeefd8be..d5e1b9e1aace7453b16afd13546fd834129e428a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,4 +3,4 @@ here's only the changelog for the version in development # v1.2 - +* removed the limit for string length and list and object size diff --git a/minijson/routines.pyx b/minijson/routines.pyx index 06e4616fc5911d63e4852db7b7eaa0f2eb8aaf48..dc015d5ad3490b53ca93eb3c734af0233ae88e9f 100644 --- a/minijson/routines.pyx +++ b/minijson/routines.pyx @@ -170,6 +170,28 @@ cpdef tuple parse(bytes data, int starting_position): e_dict = {} offset, e_dict = parse_dict(data, elements, starting_position+2) return offset+2, e_dict + elif value_type == 13: # string payload follows the type byte and a 2-byte length + string_length, = STRUCT_H.unpack(data[starting_position+1:starting_position+3]) + return 3+string_length, data[starting_position+3:starting_position+string_length+3].decode('utf-8') + elif value_type == 14: + string_length, = STRUCT_L.unpack(data[starting_position+1:starting_position+5]) + return 5+string_length, data[starting_position+5:starting_position+string_length+5].decode('utf-8') + elif value_type == 15: + elements, = STRUCT_H.unpack(data[starting_position+1:starting_position+3]) + offset, e_list = parse_list(data, elements, starting_position+3) + return 3+offset, e_list + elif value_type == 16: + elements, = STRUCT_L.unpack(data[starting_position+1:starting_position+5]) + offset, e_list = parse_list(data, elements, starting_position+5) + return 5+offset, e_list + elif value_type == 17: + elements, = STRUCT_H.unpack(data[starting_position+1:starting_position+3]) + offset, e_dict = parse_dict(data, elements, starting_position+3) + return offset+3, e_dict + elif value_type == 18: + elements, = STRUCT_L.unpack(data[starting_position+1:starting_position+5]) + offset, e_dict = parse_dict(data, elements, starting_position+5) + return offset+5, e_dict raise DecodingError('Unknown sequence type %s!' 
% (value_type, )) except IndexError as e: raise DecodingError('String too short!') from e @@ -202,16 +224,26 @@ cpdef int dump(object data, cio: io.BytesIO) except -1: return 1 elif isinstance(data, str): length = len(data) - if length > 255: - raise EncodingError('Cannot encode string longer than 255 characters') if length < 128: cio.write(bytearray([0x80 | length])) cio.write(data.encode('utf-8')) return 1+length - else: + elif length < 255: cio.write(bytearray([0, length])) cio.write(data.encode('utf-8')) return 2+length + elif length < 65535: + cio.write(b'\x0D') + cio.write(STRUCT_H.pack(length)) + cio.write(data.encode('utf-8')) + return 3+length # type byte, 2-byte length, then the payload + elif length < 0xFFFFFFFF: + cio.write(b'\x0E') + cio.write(STRUCT_L.pack(length)) + cio.write(data.encode('utf-8')) + return 5+length # type byte, 4-byte length, then the payload + else: + raise EncodingError('String is too long!') elif isinstance(data, int): if -128 <= data <= 127: # signed char, type 3 cio.write(b'\x03') @@ -253,31 +285,50 @@ cpdef int dump(object data, cio: io.BytesIO) except -1: return 9 elif isinstance(data, (tuple, list)): length = len(data) - if length > 255: - raise EncodingError('Too long of a list, maximum list length is 255') if length < 16: cio.write(bytearray([0b01000000 | length])) length = 1 - else: + elif length < 256: cio.write(bytearray([7, length])) length = 2 + elif length < 65536: + cio.write(b'\x0F') + cio.write(STRUCT_H.pack(length)) + length = 3 + elif length <= 0xFFFFFFFF: + cio.write(b'\x10') + cio.write(STRUCT_L.pack(length)) + length = 5 + else: + raise EncodingError('List is too long!') for elem in data: length += dump(elem, cio) return length elif isinstance(data, dict): length = len(data) - if length > 255: - raise EncodingError('Too long of a dict, maximum dict length is 255') if length < 16: cio.write(bytearray([0b01010000 | length])) length = 1 - else: + elif length < 256: cio.write(bytearray([11, len(data)])) length = 2 - for field_name, elem in data.items(): - cio.write(bytearray([len(field_name)])) - cio.write(field_name.encode('utf-8')) - length += dump(elem, cio) + elif length < 65536: + 
cio.write(b'\x11') + cio.write(STRUCT_H.pack(length)) + length = 3 + elif length <= 0xFFFFFFFF: + cio.write(b'\x12') + cio.write(STRUCT_L.pack(length)) + length = 5 + else: + raise EncodingError('Dict is too long!') + try: + for field_name, elem in data.items(): + cio.write(bytearray([len(field_name)])) + cio.write(field_name.encode('utf-8')) + length += dump(elem, cio) + except TypeError as e: + raise EncodingError('Keys have to be strings!') from e return length else: raise EncodingError('Unknown value type %s' % (data, )) diff --git a/setup.py b/setup.py index c575d728799b242c337ce23f9301ecf0972e6356..682be46413c344a93413830f14a86808e8ad7540 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ if 'DEBUG' in os.environ: directives['embedsignature'] = True -setup(version='1.2a1', +setup(version='1.2', packages=find_packages(include=['minijson', 'minijson.*']), ext_modules=build([Multibuild('minijson', find_pyx('minijson'), dont_snakehouse=dont_snakehouse), ], diff --git a/specification.md b/specification.md index d6e57d07d36124cfc657036aeaee0f9d2b0598e4..9b982fd9f9f2346f2fd29000247a88d00116d289 100644 --- a/specification.md +++ b/specification.md @@ -4,9 +4,7 @@ MiniJSON specification MiniJSON is a binary encoding for a subset of JSON that: * has no keys longer than 255 bytes UTF-8 -* has no string values longer than 255 bytes UTF-8 -* has no lists longer than 255 elements -* has no dicts longer than 255 elements +* has only string keys MiniJSON is bigger endian. 
@@ -40,3 +38,17 @@ with len of (value & 0x7F) followed by field name in UTF-8, and then goes the Value of the element * If value is 12, then next data is unsigned int24 +* If value is 13, then next data is an unsigned short representing the count + of characters, and then these characters follow and are + interpreted as a UTF-8 string +* If value is 14, then next data is an unsigned int representing the count + of characters, and then these characters follow and are + interpreted as a UTF-8 string +* If value is 15, then next data is an unsigned short, + and then a list follows of that many elements +* If value is 16, then next data is an unsigned int, + and then a list follows of that many elements +* If value is 17, then next data is an unsigned short, + and then an object follows of that many elements +* If value is 18, then next data is an unsigned int, + and then an object follows of that many elements diff --git a/tests/test_minijson.py b/tests/test_minijson.py index 4319416de4374a878c1add9a3fb0b37ce3dca939..9ed15b82f9972cc1cfcea67732d95dd5e647215e 100644 --- a/tests/test_minijson.py +++ b/tests/test_minijson.py @@ -4,6 +4,18 @@ from minijson import dumps, loads, dumps_object, loads_object, EncodingError, De class TestMiniJSON(unittest.TestCase): + def test_string(self): + a = 'test' + b = 't'*128 + c = 't'*65535 + d = 't'*128342 + e = 't'*1024 + self.assertEqual(loads(dumps(a)), a) + self.assertEqual(loads(dumps(b)), b) + self.assertEqual(loads(dumps(c)), c) + self.assertEqual(loads(dumps(d)), d) + self.assertEqual(loads(dumps(e)), e) + def test_lists(self): a = [1, 2, 3] b = dumps(a) @@ -11,7 +23,7 @@ class TestMiniJSON(unittest.TestCase): self.assertEqual(a, c) a = [None]*256 - self.assertRaises(EncodingError, lambda: dumps(a)) + self.assertEqual(loads(dumps(a)), a) def test_long_lists(self): a = [None]*17 @@ -28,15 +40,23 @@ class TestMiniJSON(unittest.TestCase): c = loads(b) self.assertEqual(a, c) - def test_exceptions(self): + def test_long_dicts_and_lists(self): a = {} for i in range(65535): - a[i] = i*2 - self.assertRaises(EncodingError, 
lambda: dumps(a)) + a[str(i)] = i*2 + self.assertEqual(loads(dumps(a)), a) + a = {} + for i in range(0xFFFFF): + a[str(i)] = i*2 + self.assertEqual(loads(dumps(a)), a) a = [] for i in range(65535): a.append(i) - self.assertRaises(EncodingError, lambda: dumps(a)) + self.assertEqual(loads(dumps(a)), a) + a = [] + for i in range(0xFFFFF): + a.append(i*2) + self.assertEqual(loads(dumps(a)), a) def test_dumps(self): v = {"name": "land", "operator_id": "dupa", "parameters": @@ -46,7 +66,7 @@ class TestMiniJSON(unittest.TestCase): self.assertEqual(v, c) def test_loads_exception(self): - b = b'\x1A' + b = b'\x1F' self.assertRaises(DecodingError, lambda: loads(b)) def test_loads(self):