From 4b570ff0df2e7782d55c20399a1590cf55b3f29a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 00:32:59 +0800
Subject: [PATCH 1/8] feat: test code for arcfour.py

---
 pdf2zh/arcfour.py     |  2 ++
 pyproject.toml        |  3 ++-
 tests/test_arcfour.py | 60 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_arcfour.py

diff --git a/pdf2zh/arcfour.py b/pdf2zh/arcfour.py
index cc78e3612..5e524dbf9 100644
--- a/pdf2zh/arcfour.py
+++ b/pdf2zh/arcfour.py
@@ -13,6 +13,8 @@ def __init__(self, key: Sequence[int]) -> None:
         s = [i for i in range(256)]
         j = 0
         klen = len(key)
+        if not (0 < klen < 256):
+            raise ValueError("key must be non-empty and less than 256 bytes")
         for i in range(256):
             j = (j + s[i] + key[i % klen]) % 256
             (s[i], s[j]) = (s[j], s[i])
diff --git a/pyproject.toml b/pyproject.toml
index e95c3f2c5..28fce2115 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,8 @@ torch = [
 dev = [
     "black",
     "flake8",
-    "pre-commit"
+    "pre-commit",
+    "pytest",
 ]
 
 [project.urls]
diff --git a/tests/test_arcfour.py b/tests/test_arcfour.py
new file mode 100644
index 000000000..39559fbc9
--- /dev/null
+++ b/tests/test_arcfour.py
@@ -0,0 +1,60 @@
+import pytest
+from pdf2zh.arcfour import Arcfour
+
+
+def arcfour_encrypt(plaintext: bytes, key: bytes) -> bytes:
+    cipher = Arcfour(key)
+    return cipher.encrypt(plaintext)
+
+
+def arcfour_decrypt(ciphertext: bytes, key: bytes) -> bytes:
+    cipher = Arcfour(key)
+    return cipher.decrypt(ciphertext)
+
+
+def test_basic_functionality():
+    plaintext = b"Hello, World!"
+    key = b"mysecretkey"
+    ciphertext = arcfour_encrypt(plaintext, key)
+    decrypted_text = arcfour_decrypt(ciphertext, key)
+    assert decrypted_text == plaintext
+
+def test_ciphertext():
+    plaintext = b"Hello, World!"
+    key = b"mysecretkey"
+    ciphertext = arcfour_encrypt(plaintext, key)
+    assert ciphertext.hex() == "a2b614d4af651ec7af7f5259db"
+
+def test_empty_plaintext():
+    plaintext = b""
+    key = b"testkey"
+    ciphertext = arcfour_encrypt(plaintext, key)
+    decrypted_text = arcfour_decrypt(ciphertext, key)
+    assert decrypted_text == plaintext
+
+
+def test_empty_key():
+    plaintext = b"Test data"
+    key = b""
+    with pytest.raises(ValueError):  # Key must be non-empty
+        arcfour_encrypt(plaintext, key)
+
+
+def test_large_key():
+    plaintext = b"Test data"
+    key = b"X" * 256
+    with pytest.raises(ValueError):  # Key must be less than 256 bytes
+        arcfour_encrypt(plaintext, key)
+
+def test_large_key2():
+    plaintext = b"Test data"
+    key = b"X" * 255
+    ciphertext = arcfour_encrypt(plaintext, key)
+    decrypted_text = arcfour_decrypt(ciphertext, key)
+    assert decrypted_text == plaintext
+
+def test_randomness():
+    plaintext = b"AAAAA"
+    key = b"randomkey"
+    ciphertexts = [arcfour_encrypt(plaintext, key) for _ in range(10)]
+    assert len(set(ciphertexts)) <= 1  # All ciphertexts should be the same

From 25937abc435b03ce38195c6f327a7d5fc4ccd977 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:00:39 +0800
Subject: [PATCH 2/8] feat: test ascii85

---
 pdf2zh/ascii85.py     | 36 ++++++++++++-------------
 tests/test_ascii85.py | 62 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 18 deletions(-)
 create mode 100644 tests/test_ascii85.py

diff --git a/pdf2zh/ascii85.py b/pdf2zh/ascii85.py
index 233bc744a..2091aa47f 100644
--- a/pdf2zh/ascii85.py
+++ b/pdf2zh/ascii85.py
@@ -4,7 +4,6 @@
 
 """
 
-import re
 import struct
 
 
@@ -38,14 +37,13 @@ def ascii85decode(data: bytes) -> bytes:
                     b = b * 85 + 84
                 out += struct.pack(">L", b)[: n - 1]
             break
+        elif c.isspace():
+            continue
+        else:
+            raise ValueError("Bad character in ASCII85Decode")
     return out
 
 
-# asciihexdecode(data)
-hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
-trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
-
-
 def asciihexdecode(data: bytes) -> bytes:
     """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
     For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
@@ -56,15 +54,17 @@ def asciihexdecode(data: bytes) -> bytes:
     will behave as if a 0 followed the last digit.
     """
 
-    def decode(x: bytes) -> bytes:
-        i = int(x, 16)
-        return bytes((i,))
-
-    out = b""
-    for x in hex_re.findall(data):
-        out += decode(x)
-
-    m = trail_re.search(data)
-    if m:
-        out += decode(m.group(1) + b"0")
-    return out
+    hex_str = b""
+    for i in data:
+        c = bytes((i,))
+        if b"0" <= c <= b"9" or b"a" <= c <= b"f" or b"A" <= c <= b"F":
+            hex_str += c
+        elif c == b">":
+            break
+        elif c in b" \n\r\t":
+            continue
+        else:
+            raise ValueError("Bad character in ASCIIHexDecode")
+    if len(hex_str) % 2 == 1:
+        hex_str += b"0"
+    return bytes.fromhex(hex_str.decode())
diff --git a/tests/test_ascii85.py b/tests/test_ascii85.py
new file mode 100644
index 000000000..d1208a1e6
--- /dev/null
+++ b/tests/test_ascii85.py
@@ -0,0 +1,62 @@
+import pytest
+from pdf2zh.ascii85 import ascii85decode, asciihexdecode
+
+
+def test_ascii85decode_basic():
+    encoded = b"87cURD_*#TDfTZ)+T"
+    decoded = ascii85decode(encoded + b"~>")
+    assert decoded == b"Hello, world!"
+
+
+def test_ascii85decode_empty():
+    encoded = b""
+    decoded = ascii85decode(encoded)
+    assert decoded == b""
+
+
+def test_ascii85decode_with_z():
+    encoded = b"z"
+    decoded = ascii85decode(encoded)
+    assert decoded == b"\x00\x00\x00\x00"
+
+
+def test_ascii85decode_partial_group():
+    encoded = b"9jqo^BlbD-BleB1DJ+*+F(f,q"  # Encodes 'Man is distinguished'
+    decoded = ascii85decode(encoded + b"~>")
+    assert decoded.startswith(b"Man is distinguished")
+
+
+def test_ascii85decode_with_termination():
+    encoded = b"9jqo^~>"
+    decoded = ascii85decode(encoded)
+    assert decoded == b"Man "
+
+
+def test_asciihexdecode_basic():
+    encoded = b"48656C6C6F>"
+    decoded = asciihexdecode(encoded)
+    assert decoded == b"Hello"
+
+
+def test_asciihexdecode_with_whitespace():
+    encoded = b"48 65 6C 6C 6F >"
+    decoded = asciihexdecode(encoded)
+    assert decoded == b"Hello"
+
+
+def test_asciihexdecode_odd_length():
+    encoded = b"48656C6C6F3"
+    decoded = asciihexdecode(encoded)
+    assert decoded == b"Hello0"
+
+
+def test_asciihexdecode_empty():
+    encoded = b""
+    decoded = asciihexdecode(encoded)
+    assert decoded == b""
+
+
+def test_asciihexdecode_invalid_characters():
+    encoded = b"ZZ>"
+    with pytest.raises(ValueError):
+        asciihexdecode(encoded)

From 61a7698cb78ed4074bd8a7d07d01f3e7f28a1890 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:03:16 +0800
Subject: [PATCH 3/8] feat: test casting

---
 tests/test_casting.py | 59 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 tests/test_casting.py

diff --git a/tests/test_casting.py b/tests/test_casting.py
new file mode 100644
index 000000000..610310c06
--- /dev/null
+++ b/tests/test_casting.py
@@ -0,0 +1,59 @@
+from pdf2zh.casting import safe_int, safe_float
+
+
+def test_safe_int_valid_string():
+    assert safe_int("123") == 123
+
+
+def test_safe_int_valid_integer():
+    assert safe_int(123) == 123
+
+
+def test_safe_int_valid_float_string():
+    assert safe_int("123.45") is None
+
+
+def test_safe_int_invalid_string():
+    assert safe_int("abc") is None
+
+
+def test_safe_int_none():
+    assert safe_int(None) is None
+
+
+def test_safe_int_boolean():
+    assert safe_int(True) == 1
+    assert safe_int(False) == 0
+
+
+def test_safe_int_special_chars():
+    assert safe_int("!@#") is None
+
+
+def test_safe_float_valid_float():
+    assert safe_float("123.45") == 123.45
+
+
+def test_safe_float_valid_integer_string():
+    assert safe_float("123") == 123.0
+
+
+def test_safe_float_scientific_notation():
+    assert safe_float("1.23e-4") == 0.000123
+
+
+def test_safe_float_invalid_string():
+    assert safe_float("abc") is None
+
+
+def test_safe_float_none():
+    assert safe_float(None) is None
+
+
+def test_safe_float_boolean():
+    assert safe_float(True) == 1.0
+    assert safe_float(False) == 0.0
+
+
+def test_safe_float_special_chars():
+    assert safe_float("!@#") is None

From 22d3c6956433d768f6fc8fadd6b181dcc2fb5a4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:12:29 +0800
Subject: [PATCH 4/8] feat: add test for ccitt

---
 tests/test_ccitt.py | 107 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 tests/test_ccitt.py

diff --git a/tests/test_ccitt.py b/tests/test_ccitt.py
new file mode 100644
index 000000000..8359ea781
--- /dev/null
+++ b/tests/test_ccitt.py
@@ -0,0 +1,107 @@
+import pytest
+from pdf2zh.ccitt import BitParser, CCITTG4Parser, CCITTFaxDecoder, ccittfaxdecode
+
+class TestBitParser:
+    def test_init(self):
+        parser = BitParser()
+        assert parser._pos == 0
+        
+    def test_add_bits(self):
+        root = [None, None]
+        BitParser.add(root, 1, "1")
+        assert root[1] == 1
+        assert root[0] is None
+
+    def test_feedbytes_simple(self):
+        parser = BitParser()
+        parser._state = [None, None]
+        parser._accept = lambda x: [None, None]
+        parser.feedbytes(b'\x80')  # 10000000
+        assert parser._pos == 8
+
+
+class TestCCITTG4Parser:
+    def test_init(self):
+        parser = CCITTG4Parser(width=8)
+        assert parser.width == 8
+        assert parser.bytealign is False
+        assert len(parser._curline) == 8
+        
+    def test_mode_parsing(self):
+        parser = CCITTG4Parser(width=8)
+        # Test pass mode
+        new_state = parser._parse_mode("p")
+        assert new_state == parser.MODE
+        
+        # Test horizontal mode
+        new_state = parser._parse_mode("h")
+        assert new_state in (parser.WHITE, parser.BLACK)
+        
+    def test_vertical_coding(self):
+        parser = CCITTG4Parser(width=8)
+        parser._curpos = 0
+        parser._color = 1
+        parser._do_vertical(0)  # No offset
+        assert parser._curpos >= 0
+        
+    @pytest.mark.parametrize("n1,n2", [(1,1), (2,2), (3,3)])
+    def test_horizontal_coding(self, n1, n2):
+        parser = CCITTG4Parser(width=8)
+        parser._curpos = 0
+        parser._color = 1
+        parser._do_horizontal(n1, n2)
+        assert parser._curpos == n1 + n2
+
+
+class TestCCITTFaxDecoder:
+    def test_init(self):
+        decoder = CCITTFaxDecoder(width=8)
+        assert decoder.width == 8
+        assert decoder.reversed is False
+        assert decoder._buf == b""
+        
+    def test_decode_empty(self):
+        decoder = CCITTFaxDecoder(width=8)
+        result = decoder.close()
+        assert result == b""
+        
+    def test_basic_decoding(self):
+        decoder = CCITTFaxDecoder(width=8)
+        decoder.feedbytes(b'\x00')
+        result = decoder.close()
+        assert isinstance(result, bytes)
+
+
+def test_ccittfaxdecode():
+    params = {
+        'K': -1,
+        'Columns': 8,
+        'EncodedByteAlign': False,
+        'BlackIs1': False
+    }
+    result = ccittfaxdecode(b'\x00', params)
+    assert isinstance(result, bytes)
+    
+    with pytest.raises(Exception):
+        ccittfaxdecode(b'\x00', {'K': 0})  # Should raise for unsupported K value
+
+
+@pytest.fixture
+def basic_parser():
+    return CCITTG4Parser(width=8)
+
+
+class TestErrorHandling:
+    def test_invalid_mode(self, basic_parser):
+        with pytest.raises(CCITTG4Parser.InvalidData):
+            basic_parser._parse_mode("invalid")
+            
+    def test_eofb_detection(self, basic_parser):
+        with pytest.raises(CCITTG4Parser.EOFB):
+            basic_parser._parse_mode("e")
+            
+    def test_byte_alignment(self):
+        parser = CCITTG4Parser(width=8, bytealign=True)
+        parser._curpos = 8  # Simulate end of line
+        with pytest.raises(CCITTG4Parser.ByteSkip):
+            parser._flush_line()
\ No newline at end of file

From 8bfda2f5d64c5588b36f4f8abbd5ae2f7109a55c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:17:34 +0800
Subject: [PATCH 5/8] chore: format code

---
 tests/test_ccitt.py | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/tests/test_ccitt.py b/tests/test_ccitt.py
index 8359ea781..ebdc634fc 100644
--- a/tests/test_ccitt.py
+++ b/tests/test_ccitt.py
@@ -1,11 +1,12 @@
 import pytest
 from pdf2zh.ccitt import BitParser, CCITTG4Parser, CCITTFaxDecoder, ccittfaxdecode
 
+
 class TestBitParser:
     def test_init(self):
         parser = BitParser()
         assert parser._pos == 0
-        
+
     def test_add_bits(self):
         root = [None, None]
         BitParser.add(root, 1, "1")
@@ -16,7 +17,7 @@ def test_feedbytes_simple(self):
         parser = BitParser()
         parser._state = [None, None]
         parser._accept = lambda x: [None, None]
-        parser.feedbytes(b'\x80')  # 10000000
+        parser.feedbytes(b"\x80")  # 10000000
         assert parser._pos == 8
 
 
@@ -26,25 +27,25 @@ def test_init(self):
         assert parser.width == 8
         assert parser.bytealign is False
         assert len(parser._curline) == 8
-        
+
     def test_mode_parsing(self):
         parser = CCITTG4Parser(width=8)
         # Test pass mode
         new_state = parser._parse_mode("p")
         assert new_state == parser.MODE
-        
+
         # Test horizontal mode
         new_state = parser._parse_mode("h")
         assert new_state in (parser.WHITE, parser.BLACK)
-        
+
     def test_vertical_coding(self):
         parser = CCITTG4Parser(width=8)
         parser._curpos = 0
         parser._color = 1
         parser._do_vertical(0)  # No offset
         assert parser._curpos >= 0
-        
-    @pytest.mark.parametrize("n1,n2", [(1,1), (2,2), (3,3)])
+
+    @pytest.mark.parametrize("n1,n2", [(1, 1), (2, 2), (3, 3)])
     def test_horizontal_coding(self, n1, n2):
         parser = CCITTG4Parser(width=8)
         parser._curpos = 0
@@ -59,31 +60,26 @@ def test_init(self):
         assert decoder.width == 8
         assert decoder.reversed is False
         assert decoder._buf == b""
-        
+
     def test_decode_empty(self):
         decoder = CCITTFaxDecoder(width=8)
         result = decoder.close()
         assert result == b""
-        
+
     def test_basic_decoding(self):
         decoder = CCITTFaxDecoder(width=8)
-        decoder.feedbytes(b'\x00')
+        decoder.feedbytes(b"\x00")
         result = decoder.close()
         assert isinstance(result, bytes)
 
 
 def test_ccittfaxdecode():
-    params = {
-        'K': -1,
-        'Columns': 8,
-        'EncodedByteAlign': False,
-        'BlackIs1': False
-    }
-    result = ccittfaxdecode(b'\x00', params)
+    params = {"K": -1, "Columns": 8, "EncodedByteAlign": False, "BlackIs1": False}
+    result = ccittfaxdecode(b"\x00", params)
     assert isinstance(result, bytes)
-    
+
     with pytest.raises(Exception):
-        ccittfaxdecode(b'\x00', {'K': 0})  # Should raise for unsupported K value
+        ccittfaxdecode(b"\x00", {"K": 0})  # Should raise for unsupported K value
 
 
 @pytest.fixture
@@ -95,13 +91,13 @@ class TestErrorHandling:
     def test_invalid_mode(self, basic_parser):
         with pytest.raises(CCITTG4Parser.InvalidData):
             basic_parser._parse_mode("invalid")
-            
+
     def test_eofb_detection(self, basic_parser):
         with pytest.raises(CCITTG4Parser.EOFB):
             basic_parser._parse_mode("e")
-            
+
     def test_byte_alignment(self):
         parser = CCITTG4Parser(width=8, bytealign=True)
         parser._curpos = 8  # Simulate end of line
         with pytest.raises(CCITTG4Parser.ByteSkip):
-            parser._flush_line()
\ No newline at end of file
+            parser._flush_line()

From 947279ad44d64fa98e7b3ed1e8c5470570a941c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:24:30 +0800
Subject: [PATCH 6/8] feat: add test for cmapdb

---
 tests/test_cmapdb.py | 118 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 tests/test_cmapdb.py

diff --git a/tests/test_cmapdb.py b/tests/test_cmapdb.py
new file mode 100644
index 000000000..e622d460e
--- /dev/null
+++ b/tests/test_cmapdb.py
@@ -0,0 +1,118 @@
+import pytest
+
+from pdf2zh.cmapdb import (
+    CMapBase,
+    CMap,
+    IdentityCMap,
+    IdentityCMapByte,
+    UnicodeMap,
+    IdentityUnicodeMap,
+    FileCMap,
+    FileUnicodeMap,
+    PSLiteral,
+    PDFTypeError,
+)
+
+
+def test_cmap_base():
+    cmap = CMapBase(WMode=0, Name="TestMap")
+    assert cmap.attrs["WMode"] == 0
+    assert cmap.attrs["Name"] == "TestMap"
+    assert not cmap.is_vertical()
+
+    cmap.set_attr("WMode", 1)
+    assert cmap.is_vertical()
+
+
+def test_cmap():
+    cmap = CMap(CMapName="TestCMap")
+    assert repr(cmap) == "<CMap: TestCMap>"
+
+    # Test code2cid mapping
+    cmap.code2cid = {1: 100, 2: {3: 200}}
+    decoded = list(cmap.decode(bytes([1])))
+    assert decoded == [100]
+
+    decoded = list(cmap.decode(bytes([2, 3])))
+    assert decoded == [200]
+
+
+def test_cmap_use_cmap():
+    cmap1 = CMap()
+    cmap1.code2cid = {1: 100, 2: {3: 200}}
+
+    cmap2 = CMap()
+    cmap2.use_cmap(cmap1)
+    assert cmap2.code2cid == {1: 100, 2: {3: 200}}
+
+
+def test_identity_cmap():
+    cmap = IdentityCMap()
+    result = cmap.decode(b"\x00\x01\x00\x02")
+    assert result == (1, 2)
+
+    # Test empty input
+    assert cmap.decode(b"") == ()
+
+
+def test_identity_cmap_byte():
+    cmap = IdentityCMapByte()
+    result = cmap.decode(b"\x01\x02\x03")
+    assert result == (1, 2, 3)
+
+    # Test empty input
+    assert cmap.decode(b"") == ()
+
+
+def test_unicode_map():
+    umap = UnicodeMap(CMapName="TestUnicode")
+    assert repr(umap) == "<UnicodeMap: TestUnicode>"
+
+    umap.cid2unichr[100] = "A"
+    assert umap.get_unichr(100) == "A"
+
+
+def test_identity_unicode_map():
+    umap = IdentityUnicodeMap()
+    assert umap.get_unichr(65) == "A"  # ASCII 65 = 'A'
+    assert umap.get_unichr(0x4E00) == "一"  # CJK Unified Ideograph
+
+
+def test_file_cmap():
+    cmap = FileCMap()
+    cmap.add_code2cid("AB", 100)
+
+    result = list(cmap.decode(bytes([ord("A"), ord("B")])))
+    assert result == [100]
+
+
+def test_file_unicode_map():
+    umap = FileUnicodeMap()
+
+    # Test with PSLiteral
+    umap.add_cid2unichr(100, PSLiteral("A"))
+    assert umap.get_unichr(100) == "A"
+
+    # Test with bytes (UTF-16BE)
+    umap.add_cid2unichr(101, b"\x00A")
+    assert umap.get_unichr(101) == "A"
+
+    # Test with integer
+    umap.add_cid2unichr(102, 65)  # ASCII 65 = 'A'
+    assert umap.get_unichr(102) == "A"
+
+    # Test invalid input
+    with pytest.raises(PDFTypeError):
+        umap.add_cid2unichr(103, 1.5)  # type: ignore
+
+
+def test_file_unicode_map_space_handling():
+    umap = FileUnicodeMap()
+
+    # Add regular space
+    umap.add_cid2unichr(100, 32)  # space character
+    assert umap.get_unichr(100) == " "
+
+    # Try to add non-breaking space - should be ignored
+    umap.add_cid2unichr(100, 0xA0)  # non-breaking space
+    assert umap.get_unichr(100) == " "  # should still be regular space

From 3a684748785e108db8be295fb71aab230995485f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:28:12 +0800
Subject: [PATCH 7/8] feat: add test for data_structures

---
 tests/test_data_structures.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 tests/test_data_structures.py

diff --git a/tests/test_data_structures.py b/tests/test_data_structures.py
new file mode 100644
index 000000000..58ba408a5
--- /dev/null
+++ b/tests/test_data_structures.py
@@ -0,0 +1,35 @@
+import pytest
+from pdf2zh.data_structures import NumberTree
+from pdf2zh import settings
+
+
+def test_number_tree_empty():
+    tree = NumberTree({})
+    assert tree.values == []
+
+
+def test_number_tree_leaf():
+    tree = NumberTree({"Nums": [1, "a", 2, "b", 3, "c"]})
+    assert tree.values == [(1, "a"), (2, "b"), (3, "c")]
+
+
+def test_number_tree_kids():
+    child1 = {"Nums": [1, "a", 2, "b"]}
+    child2 = {"Nums": [3, "c", 4, "d"]}
+    tree = NumberTree({"Kids": [child1, child2]})
+    assert tree.values == [(1, "a"), (2, "b"), (3, "c"), (4, "d")]
+
+
+def test_number_tree_unordered():
+    tree = NumberTree({"Nums": [3, "c", 1, "a", 2, "b"]})
+    if settings.STRICT:
+        with pytest.raises(Exception):
+            tree.values
+    else:
+        assert tree.values == [(1, "a"), (2, "b"), (3, "c")]
+
+
+def test_number_tree_limits():
+    tree = NumberTree({"Nums": [1, "a"], "Limits": [1, 1]})
+    assert tree.values == [(1, "a")]
+    assert tree.limits == [1, 1]

From a30f89d719c780e528a7eba59920eb792cd4d9bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BF=98=E5=BF=A7=E5=8C=97=E8=90=B1=E8=8D=89?=
 <wybxc@qq.com>
Date: Wed, 27 Nov 2024 01:29:33 +0800
Subject: [PATCH 8/8] feat: add test for encodingdb

---
 tests/test_encodingdb.py | 64 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 tests/test_encodingdb.py

diff --git a/tests/test_encodingdb.py b/tests/test_encodingdb.py
new file mode 100644
index 000000000..95d72b777
--- /dev/null
+++ b/tests/test_encodingdb.py
@@ -0,0 +1,64 @@
+import pytest
+from pdf2zh.encodingdb import name2unicode, EncodingDB, PDFKeyError
+from pdf2zh.psparser import PSLiteral
+
+
+def test_name2unicode_basic():
+    assert name2unicode("A") == "A"
+    assert name2unicode("dollar") == "$"
+
+
+def test_name2unicode_composite():
+    assert name2unicode("A_B") == "AB"
+    assert name2unicode("A_B_C") == "ABC"
+
+
+def test_name2unicode_uni():
+    assert name2unicode("uni0041") == "A"  # Unicode A
+    assert name2unicode("uni00410042") == "AB"  # Unicode AB
+
+
+def test_name2unicode_u():
+    assert name2unicode("u0041") == "A"  # Unicode A
+    assert name2unicode("u1F600") == "😀"  # Unicode emoji
+
+
+def test_name2unicode_invalid_type():
+    with pytest.raises(PDFKeyError):
+        name2unicode(123)  # type: ignore
+
+
+def test_name2unicode_invalid_name():
+    with pytest.raises(PDFKeyError):
+        name2unicode("invalid_name")
+
+
+def test_name2unicode_invalid_unicode():
+    with pytest.raises(PDFKeyError):
+        name2unicode("uniD800")  # Surrogate pair range
+
+
+def test_encoding_db_basic():
+    assert "StandardEncoding" in EncodingDB.encodings
+    assert "MacRomanEncoding" in EncodingDB.encodings
+    assert "WinAnsiEncoding" in EncodingDB.encodings
+    assert "PDFDocEncoding" in EncodingDB.encodings
+
+
+def test_encoding_db_get_encoding():
+    encoding = EncodingDB.get_encoding("StandardEncoding")
+    assert isinstance(encoding, dict)
+    assert 65 in encoding  # ASCII 'A'
+
+
+def test_encoding_db_get_encoding_with_diff():
+    diff = [0, PSLiteral("A"), PSLiteral("B")]
+    encoding = EncodingDB.get_encoding("StandardEncoding", diff)
+    assert encoding[0] == "A"
+    assert encoding[1] == "B"
+
+
+def test_encoding_db_get_encoding_invalid():
+    # Should return StandardEncoding for invalid encoding name
+    encoding = EncodingDB.get_encoding("InvalidEncoding")
+    assert encoding == EncodingDB.std2unicode