From da5f6dc6486aec9e86ab9d87d790883203540e35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ma=C5=9Blanka?= <piotr.maslanka@henrietta.com.pl>
Date: Wed, 26 May 2021 15:29:52 +0200
Subject: [PATCH] object keys don't have to be strings anymore

---
 CHANGELOG.md           |  2 +
 docs/index.rst         |  7 ++++
 docs/specification.rst |  7 ++--
 minijson/routines.pyx  | 87 ++++++++++++++++++++++++++++++++----------
 setup.py               |  2 +-
 tests/test_minijson.py |  7 +++-
 6 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b73008e..0d9e211 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,3 +2,5 @@ Changelog is kept at [GitHub](https://github.com/Dronehub/minijson/releases),
 here's only the changelog for the version in development
 
 # v1.3
+
+* object keys don't have to be strings anymore
diff --git a/docs/index.rst b/docs/index.rst
index 6b4552d..0c4df87 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,13 @@ Welcome to MiniJSON's documentation!
    usage
    specification
 
+MiniJSON is a space-aware binary format for representing arbitrary JSON.
+It is, however, most efficient when dealing with short (fewer than 16 elements) lists and
+objects whose keys are all strings.
+
+You should avoid objects with non-string keys, since they will always use a
+4-byte length field. This is to be improved in a future release.
+
 Indices and tables
 ==================
 
diff --git a/docs/specification.rst b/docs/specification.rst
index 2eb4a9e..f18f4ad 100644
--- a/docs/specification.rst
+++ b/docs/specification.rst
@@ -1,10 +1,7 @@
 MiniJSON specification
 ======================
 
-MiniJSON is a binary encoding for a subset of JSON that:
-
-* has no keys longer than 255 bytes UTF-8
-* all keys are string
+MiniJSON is a space-aware binary encoding for JSON.
 
 All data is stored as bigger endian.
 
@@ -56,3 +53,5 @@ with len of (value & 0x7F)
   and then an object follows of that many elements
 * If value is 18, then next data is a unsigned int,
   and then an object follows of that many elements
+* If value is 19, then next data is an unsigned int,
+  and then follow that many pairs of Values (key: value)
diff --git a/minijson/routines.pyx b/minijson/routines.pyx
index dc015d5..166dc98 100644
--- a/minijson/routines.pyx
+++ b/minijson/routines.pyx
@@ -81,6 +81,38 @@ cdef inline tuple parse_dict(bytes data, int elem_count, int starting_position):
         dct[s_field_name] = elem
     return offset, dct
 
+cdef inline tuple parse_sdict(bytes data, int elem_count, int starting_position):
+    """
+    Parse a sdict (a dict whose keys are arbitrary values) with this many elements
+    
+    :param data: data to parse the key-value pairs from
+    :param elem_count: count of key-value pairs
+    :param starting_position: position in data at which the first key starts
+
+    :return: tuple of (how many bytes the pairs occupied, the dict itself)
+    """
+    cdef:
+        dict dct = {}
+        int i, ofs, offset = 0
+    for i in range(elem_count):
+        # each pair is two consecutive Values: first the key, then its value
+        ofs, key = parse(data, starting_position+offset)
+        offset += ofs
+        ofs, elem = parse(data, starting_position+offset)
+        offset += ofs
+        # NOTE(review): an unhashable decoded key (list/dict) raises TypeError here
+        dct[key] = elem
+    return offset, dct
+
+
+cdef bint can_be_encoded_as_a_dict(dict dct):
+    # dump() writes a plain dict's key behind a single length byte, so every
+    # key must be a str whose UTF-8 encoding (not char count) fits 255 bytes
+    for key in dct:
+        if not isinstance(key, str) or len(key.encode('utf-8')) > 255:
+            return False
+    return True
+
 
 cpdef tuple parse(bytes data, int starting_position):
     """
@@ -192,6 +224,10 @@ cpdef tuple parse(bytes data, int starting_position):
             elements, = STRUCT_L.unpack(data[starting_position+1:starting_position+5])
             offset, e_dict = parse_dict(data, elements, starting_position+5)
             return offset+5, e_dict
+        elif value_type == 19:
+            elements, = STRUCT_L.unpack(data[starting_position+1:starting_position+5])
+            offset, e_dict = parse_sdict(data, elements, starting_position+5)
+            return offset+5, e_dict
         raise DecodingError('Unknown sequence type %s!' % (value_type, ))
     except IndexError as e:
         raise DecodingError('String too short!') from e
@@ -302,28 +338,37 @@ cpdef int dump(object data, cio: io.BytesIO) except -1:
         return length
     elif isinstance(data, dict):
         length = len(data)
-        if length < 16:
-            cio.write(bytearray([0b01010000 | length]))
-            length = 1
-        elif length < 256:
-            cio.write(bytearray([11, len(data)]))
-            length = 2
-        elif length < 65536:
-            cio.write(b'\x11')
-            cio.write(STRUCT_H.pack(length))
-            length = 3
-        elif length <= 0xFFFFFFFF:
-            cio.write(b'\x12')
+        if can_be_encoded_as_a_dict(data):
+            if length < 16:
+                cio.write(bytearray([0b01010000 | length]))
+                length = 1
+            elif length < 256:
+                cio.write(bytearray([11, len(data)]))
+                length = 2
+            elif length < 65536:
+                cio.write(b'\x11')
+                cio.write(STRUCT_H.pack(length))
+                length = 3
+            elif length <= 0xFFFFFFFF:
+                cio.write(b'\x12')
+                cio.write(STRUCT_L.pack(length))
+                length = 5
+            try:
+                for field_name, elem in data.items():
+                    cio.write(bytearray([len(field_name)]))
+                    cio.write(field_name.encode('utf-8'))
+                    length += dump(elem, cio)
+            except TypeError as e:
+                raise EncodingError('Keys have to be strings!') from e
+            return length
+        else:
+            cio.write(b'\x13')
             cio.write(STRUCT_L.pack(length))
-            length = 5
-        try:
-            for field_name, elem in data.items():
-                cio.write(bytearray([len(field_name)]))
-                cio.write(field_name.encode('utf-8'))
-                length += dump(elem, cio)
-        except TypeError as e:
-            raise EncodingError('Keys have to be strings!') from e
-        return length
+            offset = 5
+            for key, value in data.items():
+                offset += dump(key, cio)
+                offset += dump(value, cio)
+            return offset
     else:
         raise EncodingError('Unknown value type %s' % (data, ))
 
diff --git a/setup.py b/setup.py
index ed1ad9c..16efff6 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,7 @@ if 'DEBUG' in os.environ:
     directives['embedsignature'] = True
 
 
-setup(version='1.3a1',
+setup(version='1.3',
       packages=find_packages(include=['minijson', 'minijson.*']),
       ext_modules=build([Multibuild('minijson', find_pyx('minijson'),
                                     dont_snakehouse=dont_snakehouse), ],
diff --git a/tests/test_minijson.py b/tests/test_minijson.py
index e485af7..0b2cacc 100644
--- a/tests/test_minijson.py
+++ b/tests/test_minijson.py
@@ -34,6 +34,12 @@ class TestMiniJSON(unittest.TestCase):
             a[str(i)] = i
         self.assertSameAfterDumpsAndLoads(a)
 
+    def test_dicts_not_string_keys(self):
+        """A dict keyed by integers must survive a dumps/loads round trip."""
+        # non-str keys make dump() take the b'\x13' (value type 19) branch
+        int_keyed = {i: i for i in range(17)}
+        self.assertSameAfterDumpsAndLoads(int_keyed)
+
     def test_long_dicts_and_lists(self):
         a = {}
         for i in range(65535):
@@ -60,7 +66,6 @@ class TestMiniJSON(unittest.TestCase):
     def test_loads_exception(self):
         b = b'\x1F'
         self.assertRaises(DecodingError, lambda: loads(b))
-        self.assertRaises(EncodingError, lambda: dumps({1: 2}))
 
     def test_loads(self):
         a = loads(b'\x0B\x03\x04name\x84land\x0Boperator_id\x84dupa\x0Aparameters\x0B\x03\x03lat\x09B4\xeb\x85\x03lon\x09B[33\x03alt\x09Cj\x00\x00')
-- 
GitLab