From c8114cbb13491d117883fdcf7ee51873d2677e87 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Tue, 12 Jun 2018 20:51:20 +0200
Subject: [PATCH 01/18] Yes. Let's play tough.

---
 lib/parsers/pdf_parser.rb | 118 ++++++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 30 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index f562dcbd..b7a7d8a8 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -22,54 +22,112 @@ def call(io)
 
     return unless safe_read(io, 9) =~ PDF_MARKER
 
-    attributes = scan_for_attributes(io)
+    io.seek(io.size - 5)
+#    return unless safe_read(io, 5) == '%%EOF'
 
+    xref_offset = locate_xref_table_offset(io)
+    return unless xref_offset
+
+    io.seek(xref_offset)
+    xref_table = parse_xref_table(io)
+
+    # return unless xref_table.any?
+    xref_table.each do |xref|
+      io.seek(xref.offset)
+      $stderr.puts io.read(xref.length_limit).inspect
+    end
+
+    raise "nope"
     FormatParser::Document.new(
       format: :pdf,
       page_count: attributes[:page_count]
     )
   end
 
-  private
+  def locate_xref_table_offset(io)
+    # Read the "tail" of the PDF and find the 'startxref' declaration
+    assumed_xref_table_size = 1024
+    tail_pos = io.size - assumed_xref_table_size
+    tail_pos = 0 if tail_pos < 0
+    io.seek(tail_pos)
+    tail = io.read(assumed_xref_table_size)
 
-  # Read ahead bytes until one of % or / is reached.
-  # A header in a PDF always starts with a /
-  # The % is to detect the EOF
-  #
-  def scan_for_attributes(io)
-    result = {}
-
-    while read = safe_read(io, 1)
-      case read
-      when '%'
-        break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
-      when '/'
-        find_page_count(io, result)
+    # Find the "startxref" declaration and read the first group of integers after it
+    start_xref_index = tail.index('startxref')
+    return unless start_xref_index
+
+    startxref = tail.byteslice(start_xref_index, 1024)[/\d+/]
+    return unless startxref
+
+    startxref.to_i
+  end
+
+  XRef = Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit)
+  def parse_xref_table(io)
+    xref_table = []
+    starting_idx = 0
+    num_objects_cross_check = nil
+    while line = read_until_linebreak(io, char_limit: 32)
+      case line
+      when /xref/
+        # Starts the cross-reference table
+      when /^(\d+) (\d+)$/
+        # Defines the starting number of the object and the number of objects in the table
+        starting_idx = $1.to_i
+        num_objects_cross_check = $2.to_i
+      when /^(\d{10}) (\d{5}) (\w)$/
+        # The actual object offset. Set the length limit to a ridiculous value since we don't know it
+        xref_table << XRef.new(starting_idx + xref_table.length, $1.to_i, $2.to_i, $3, 99999999)
+      when /trailer/
+        break
       end
     end
 
-    result
+    # Check if the number of xrefs we got makes sense
+    if num_objects_cross_check && num_objects_cross_check != xref_table.length
+      raise "The xref table was declared to contain #{num_objects_cross_check} object refs but contained #{xref_table.length}"
+    end
+
+    # Reject all disabled objects
+    xref_table.reject! {|e| e.entry_type == 'f' }
+
+    # Sort sequentially in ascending offset in document order
+    xref_table.sort_by!(&:offset)
+
+    # Update the limits which will tell us how much we need to read to have the entire object
+    pairwise(xref_table) do |xref_a, xref_b|
+      xref_a.length_limit = xref_b.offset - xref_a.offset
+    end
+
+    xref_table.each do |x|
+      $stderr.puts x.inspect
+    end
+    
+    xref_table
   end
 
-  def find_page_count(io, result)
-    COUNT_MARKERS.each do |marker|
-      if safe_read(io, marker.size) == marker
-        result[:page_count] = read_numbers(io)
+  def pairwise(enum)
+    pair = []
+    enum.each do |e|
+      pair << e
+      if pair.length == 2
+        yield(pair.first, pair.last)
+        pair.shift
       end
     end
   end
 
-  # Read ahead bytes until no more numbers are found
-  # This assumes that the position of io starts at a
-  # number
-  def read_numbers(io)
-    numbers = ''
-
-    while c = safe_read(io, 1)
-      c =~ /\d+/ ? numbers << c : break
+  def read_until_linebreak(io, char_limit: 32)
+    buf = StringIO.new(''.b)
+    char_limit.times do
+      char = safe_read(io, 1).force_encoding(Encoding::BINARY)
+      if char == "\n"
+        break
+      else
+        buf << char
+      end
     end
-
-    numbers.to_i
+    buf.string.strip
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf

From f50f18cec92fc47eaf6dd7d949db30d948c5cad8 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Tue, 12 Jun 2018 21:29:30 +0200
Subject: [PATCH 02/18] Yep yep yep

---
 lib/parsers/pdf_parser.rb | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index b7a7d8a8..00d73aa5 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -34,7 +34,9 @@ def call(io)
     # return unless xref_table.any?
     xref_table.each do |xref|
       io.seek(xref.offset)
-      $stderr.puts io.read(xref.length_limit).inspect
+      if xref.length_limit < 128
+        $stderr.puts io.read(xref.length_limit).inspect
+      end
     end
 
     raise "nope"
@@ -49,6 +51,7 @@ def locate_xref_table_offset(io)
     assumed_xref_table_size = 1024
     tail_pos = io.size - assumed_xref_table_size
     tail_pos = 0 if tail_pos < 0
+
     io.seek(tail_pos)
     tail = io.read(assumed_xref_table_size)
 
@@ -56,7 +59,7 @@ def locate_xref_table_offset(io)
     start_xref_index = tail.index('startxref')
     return unless start_xref_index
 
-    startxref = tail.byteslice(start_xref_index, 1024)[/\d+/]
+    startxref = tail.byteslice(start_xref_index, assumed_xref_table_size)[/\d+/]
     return unless startxref
 
     startxref.to_i
@@ -102,7 +105,7 @@ def parse_xref_table(io)
     xref_table.each do |x|
       $stderr.puts x.inspect
     end
-    
+
     xref_table
   end
 

From 75f513c9df5d8d5a8b6fab435833340af67f7f19 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Wed, 13 Jun 2018 03:40:22 +0200
Subject: [PATCH 03/18] Yes yes

---
 ...fe693a8f87297f8c255d899c61b063016a4.pdfobj |   7 +
 ...4f5f4914c1f71f4916f75894e6a89e780d6.pdfobj |   7 +
 ...c658d480be83743bd54554dabc4a6681bce.pdfobj |   9 +
 ...7eb7632f0370bc3463a3884acdf3386ed83.pdfobj |   7 +
 ...a257840100914b09ce644376375450a0611.pdfobj |   8 +
 ...9334c1be555436ca56d50f5a12df1c701f6.pdfobj |   8 +
 spec/parsers/pdf_parser/example_a.pdfobj      |   9 +
 ...a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj |   8 +
 spec/parsers/pdf_parser/object_parser_spec.rb | 248 ++++++++++++++++++
 9 files changed, 311 insertions(+)
 create mode 100644 spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj
 create mode 100644 spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj
 create mode 100644 spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj
 create mode 100644 spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj
 create mode 100644 spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj
 create mode 100644 spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj
 create mode 100644 spec/parsers/pdf_parser/example_a.pdfobj
 create mode 100644 spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj
 create mode 100644 spec/parsers/pdf_parser/object_parser_spec.rb

diff --git a/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj
new file mode 100644
index 00000000..2530644b
--- /dev/null
+++ b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj
@@ -0,0 +1,7 @@
+44 0 obj
+<</Type/Catalog/Pages 31 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj
new file mode 100644
index 00000000..e3ff9300
--- /dev/null
+++ b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj
@@ -0,0 +1,7 @@
+12 0 obj
+<</Type/Catalog/Pages 4 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj
new file mode 100644
index 00000000..c2f92e6d
--- /dev/null
+++ b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj
@@ -0,0 +1,9 @@
+ 0 obj
+<</Type/Pages
+/Resources 11 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R ]
+endobj
+
+12 0 obj
+<</
\ No newline at end of file
diff --git a/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj b/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj
new file mode 100644
index 00000000..5659872d
--- /dev/null
+++ b/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj
@@ -0,0 +1,7 @@
+15 0 obj
+<</Type/Catalog/Pages 7 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj
new file mode 100644
index 00000000..4fe62abd
--- /dev/null
+++ b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj
@@ -0,0 +1,8 @@
+4 0 obj
+<</Type/Pages
+/Resources 11 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R ]
+/Count 1>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj
new file mode 100644
index 00000000..1cb8e07f
--- /dev/null
+++ b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj
@@ -0,0 +1,8 @@
+31 0 obj
+<</Type/Pages
+/Resources 43 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R 4 0 R 7 0 R 10 0 R 13 0 R 16 0 R 19 0 R 22 0 R 25 0 R 28 0 R ]
+/Count 10>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/example_a.pdfobj b/spec/parsers/pdf_parser/example_a.pdfobj
new file mode 100644
index 00000000..6d82200b
--- /dev/null
+++ b/spec/parsers/pdf_parser/example_a.pdfobj
@@ -0,0 +1,9 @@
+[
+ <<
+ /Name (Jim)
+ /Age 39
+ /Children [(Heather) (Timothy) (Rebecca)]
+ >>
+ 22
+ 44.55
+]
\ No newline at end of file
diff --git a/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj
new file mode 100644
index 00000000..24c92285
--- /dev/null
+++ b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj
@@ -0,0 +1,8 @@
+7 0 obj
+<</Type/Pages
+/Resources 14 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R 4 0 R ]
+/Count 2>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
new file mode 100644
index 00000000..490d75ad
--- /dev/null
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -0,0 +1,248 @@
+require 'spec_helper'
+
+class NuObjectParser
+  Malformed = Class.new(RuntimeError)
+  RE = ->(str) { /#{Regexp.escape(str)}/ }
+  STRATEGIES = {
+    RE["/"]  => :parse_pdf_name,
+    RE["<<"] => :parse_dictionary,
+    RE["["]  => :parse_array,
+    RE["("]  => :parse_string,
+    RE["<"]  => :parse_hex_string,
+    /\d+ \d+ R/ => :parse_ref,
+
+    RE["true"]  => :wrap,
+    RE["false"] => :wrap,
+    RE["null"]  => :wrap,
+
+    /\-?(\d+)\.(\d+)/ => :wrap_real,
+    /\-?(\d+)/ => :wrap_int,
+
+    RE["obj"]       => :wrap,
+    RE["endobj"]    => :wrap,
+    RE["stream"]    => :wrap,
+    RE["endstream"] => :wrap,
+#    RE[">>"]        => :wrap,
+#    RE["]"]         => :wrap,
+#    RE[">"]         => :wrap,
+#    RE[")"]         => :wrap,
+
+    /\s+/           => :wrap_whitespace,
+  }
+
+  STRING_ESCAPES = {
+    "\r"   => "\n",
+    "\n\r" => "\n",
+    "\r\n" => "\n",
+    "\\n"  => "\n",
+    "\\r"  => "\r",
+    "\\t"  => "\t",
+    "\\b"  => "\b",
+    "\\f"  => "\f",
+    "\\("  => "(",
+    "\\)"  => ")",
+    "\\\\" => "\\",
+    "\\\n" => "",
+  }
+  0.upto(9)   { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr }
+  0.upto(99)  { |n| STRING_ESCAPES["\\0" + n.to_s]  = ("0"+n.to_s).oct.chr }
+  0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s]   = n.to_s.oct.chr }
+
+  def wrap_true(sc, pattern)
+    @sc.scan(pattern)
+    true
+  end
+
+  def wrap_false(pattern)
+    @sc.scan(pattern)
+    false
+  end
+  
+  def wrap_nil(pattern)
+    @sc.scan(pattern)
+    nil
+  end
+
+  def wrap_real(pattern)
+    @sc.scan(pattern).to_f
+  end
+
+  def wrap_int(pattern)
+    @sc.scan(pattern).to_i
+  end
+
+  def wrap_whitespace(pattern)
+    @sc.scan(pattern)
+    :whitespace
+  end
+
+  def wrap(pattern)
+    data = @sc.scan(pattern)
+    data.to_sym
+  end
+
+  def consume!(pattern, method_name)
+    return unless @sc.check(pattern)
+    at = @sc.pos
+    result = send(method_name, pattern)
+    @token_stream << result unless result == :whitespace
+    true
+  end
+
+  def parse_ref(start_pattern)
+    [:ref, @sc.scan(start_pattern)]
+  end
+
+  def parse_array(start_pattern)
+    @sc.scan(start_pattern) # consume [
+    dict_open_at = @token_stream.length
+    walk_scanner(RE["]"])
+    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
+    array_items = @token_stream.pop(@token_stream.length - dict_open_at)
+    [:array, array_items]
+  end
+
+  def parse_dictionary(start_pattern)
+    @sc.scan(start_pattern) # consume <<
+    dict_open_at = @token_stream.length
+    walk_scanner(RE[">>"])
+    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
+    dict_items = @token_stream.pop(@token_stream.length - dict_open_at)
+    [:dict, dict_items]
+  end
+
+  def parse_string(start_pattern)
+    rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
+    raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
+    rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
+      STRING_ESCAPES[match] || ""
+    end
+  end
+
+  def parse_pdf_name(start_pattern)
+    letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/"
+    warn("Name parsing needs validation since start pattern is not the same as scan pattern")
+    [:name, @sc.scan(/\/[#{letters}\d]+/)]
+  end
+  
+  def walk_scanner(halt_at_pattern)
+    until @sc.eos?
+      # Terminate early
+      if halt_at_pattern && halted = @sc.scan(halt_at_pattern)
+        @token_stream << :terminator
+        return
+      end
+
+      # Walk through STRATEGIES and stop iterating on first non-false call to consume!
+      STRATEGIES.find do |pattern, method_name|
+        consume!(pattern, method_name)
+      end
+    end
+  end
+
+  def parse(str)
+    @sc = StringScanner.new(str)
+    @token_stream = []
+    walk_scanner(_stop_at_pattern = nil)
+    @token_stream
+  end
+end
+
+describe 'Object parser' do
+  let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort }
+
+  xit 'scans the extracted object definitions from the corpus' do
+    fixture_paths.each do |path|
+      result = NuObjectParser.new.parse(File.read(path))
+    end
+  end
+
+  it 'scans the example object from the PDF presentation' do
+    obj = File.read(__dir__ + '/example_a.pdfobj')
+    parser = NuObjectParser.new
+    result = parser.parse(obj)
+    expect(result).to eq(
+      [
+        [:array, [
+          [:dict, [
+            [:name, "/Name"], "Jim",
+            [:name, "/Age"], 39,
+            [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]]
+          ],
+          22,
+          44.55]
+        ]
+      ]
+    )
+  end
+
+  it 'scans a simple dictionary with strings and ints as values' do
+    result = NuObjectParser.new.parse('<</Name (Jim) /Age 25>>')
+    expect(result).to eq(
+      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]]
+    )
+  end
+
+  it 'scans a simple dictionary with arbitrary whitespace' do
+    result = NuObjectParser.new.parse('<<
+      /Name
+        (Jim)
+      /Age
+        25>>')
+    expect(result).to eq(
+      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]]
+    )
+  end
+
+  it 'parses an array of integers' do
+    result = NuObjectParser.new.parse('[1 2 3 4]')
+    expect(result).to eq(
+      [[:array, [1, 2, 3, 4]]]
+    )
+  end
+
+  it 'scans an array of integers with one object ref in the middle' do
+    result = NuObjectParser.new.parse('[1 20 00 R 3]')
+    expect(result).to eq(
+      [[:array, [1, [:ref, "20 00 R"], 3]]]
+    )
+  end
+
+  it 'scans an array of names' do
+    result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]')
+    expect(result).to eq(
+      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium/Rare"]]]]
+    )
+  end
+
+  it 'handles string escapes' do
+    result = NuObjectParser.new.parse("(Foo \\(with some bars\\))")
+    expect(result).to eq(
+      ["Foo (with some bars)"]
+    )
+  end
+
+  it 'detects an unterminated string' do
+    expect {
+      NuObjectParser.new.parse("(Hello there")
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects an unterminated array' do
+    expect {
+      NuObjectParser.new.parse("[")
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects an unterminated dictionary' do
+    expect {
+      NuObjectParser.new.parse("<< /Ohai")
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects a truncated dictionary opener' do
+    expect {
+      NuObjectParser.new.parse('<</')
+    }.to raise_error(/did not terminate/)
+  end
+end

From 41fd9e4d9b6bb73a484ce8b07d219e60a56a2143 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Wed, 13 Jun 2018 11:24:48 +0200
Subject: [PATCH 04/18] Comment more stuff

---
 lib/parsers/pdf_parser.rb | 41 ++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index 00d73aa5..9510236b 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -14,7 +14,6 @@ class FormatParser::PDFParser
   # this. The only way of solving this correctly is by adding
   # different types of PDF's in the specs.
   #
-  COUNT_MARKERS = ['Count ']
   EOF_MARKER    = '%EOF'
 
   def call(io)
@@ -34,8 +33,24 @@ def call(io)
     # return unless xref_table.any?
     xref_table.each do |xref|
       io.seek(xref.offset)
-      if xref.length_limit < 128
-        $stderr.puts io.read(xref.length_limit).inspect
+      # From here on out we need to proceed as follows. We need to buffer (preemptively)
+      # all the /Type/Pages objects for later. We also need to recover the
+      # /Type/Catalog object which will refer us to the right /Type /Pages object to use.
+      # It is a good idea to scan only once, and we also should be "economical" in reading these.
+      # All the objects we care about start with the object header ("45 0 obj" etc)
+      # and then must contain an arbitrary amount of whitespace (which we scientifically
+      # followed by the dictionary open brackets - "<<".
+      # Then we need to actually go in, read the object and parse the dictionary - luckily
+      # this is not that much trouble and we can read the entire object, since it is small.
+      # So let's get at it.
+      next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway 
+
+      # Do a quickie detection reading just a tiny piece of the object
+      obj_header = safe_read(io, 32)
+      if obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog')
+        io.seek(xref.offset)
+        object_buf = io.read(xref.length_limit)
+        parse_object_with_dictionary(object_buf)
       end
     end
 
@@ -49,8 +64,7 @@ def call(io)
   def locate_xref_table_offset(io)
     # Read the "tail" of the PDF and find the 'startxref' declaration
     assumed_xref_table_size = 1024
-    tail_pos = io.size - assumed_xref_table_size
-    tail_pos = 0 if tail_pos < 0
+    tail_pos = max(0, io.size - assumed_xref_table_size)
 
     io.seek(tail_pos)
     tail = io.read(assumed_xref_table_size)
@@ -66,6 +80,7 @@ def locate_xref_table_offset(io)
   end
 
   XRef = Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit)
+
   def parse_xref_table(io)
     xref_table = []
     starting_idx = 0
@@ -102,10 +117,6 @@ def parse_xref_table(io)
       xref_a.length_limit = xref_b.offset - xref_a.offset
     end
 
-    xref_table.each do |x|
-      $stderr.puts x.inspect
-    end
-
     xref_table
   end
 
@@ -133,5 +144,17 @@ def read_until_linebreak(io, char_limit: 32)
     buf.string.strip
   end
 
+  def min(*of_items)
+    of_items.sort.shift
+  end
+
+  def max(*of_items)
+    of_items.sort.pop
+  end
+
+  def parse_object_with_dictionary(str)
+    File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') {|f| f << str }
+  end
+
   FormatParser.register_parser self, natures: :document, formats: :pdf
 end

From eb0670ef3a4cff3bd79b91e49e87dd6f9db2cf6b Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Wed, 13 Jun 2018 11:25:16 +0200
Subject: [PATCH 05/18] Hex string handling

---
 spec/parsers/pdf_parser/object_parser_spec.rb | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
index 490d75ad..678049b9 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -28,6 +28,7 @@ class NuObjectParser
 #    RE[")"]         => :wrap,
 
     /\s+/           => :wrap_whitespace,
+    /./             => :garbage,
   }
 
   STRING_ESCAPES = {
@@ -111,6 +112,15 @@ def parse_dictionary(start_pattern)
     [:dict, dict_items]
   end
 
+  def parse_hex_string(start_pattern)
+    str = @sc.scan(/<[0-9a-f]+>/i)
+    raise Malformed, "Malformed hex string at #{@sc.pos}" unless str
+
+    str << "0" unless str.size % 2 == 0
+    hex_str = str.scan(/../).map {|i| i.hex.chr}.join
+    [:hex_string, hex_str]
+  end
+
   def parse_string(start_pattern)
     rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
     raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
@@ -121,12 +131,20 @@ def parse_string(start_pattern)
 
   def parse_pdf_name(start_pattern)
     letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/"
-    warn("Name parsing needs validation since start pattern is not the same as scan pattern")
-    [:name, @sc.scan(/\/[#{letters}\d]+/)]
+    name = @sc.scan(/\/[#{letters}\d]+/)
+    raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name
+    [:name, name]
   end
-  
+
+  def garbage(*)
+    raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one"
+  end
+
   def walk_scanner(halt_at_pattern)
-    until @sc.eos?
+    (@sc.string.bytesize - @sc.pos).times do
+      # Terminate if EOS reached
+      break if @sc.eos?
+
       # Terminate early
       if halt_at_pattern && halted = @sc.scan(halt_at_pattern)
         @token_stream << :terminator
@@ -151,7 +169,7 @@ def parse(str)
 describe 'Object parser' do
   let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort }
 
-  xit 'scans the extracted object definitions from the corpus' do
+  it 'scans the extracted object definitions from the corpus' do
     fixture_paths.each do |path|
       result = NuObjectParser.new.parse(File.read(path))
     end
@@ -243,6 +261,19 @@ def parse(str)
   it 'detects a truncated dictionary opener' do
     expect {
       NuObjectParser.new.parse('<</')
-    }.to raise_error(/did not terminate/)
+    }.to raise_error(/PDF name at 2/)
   end
+
+  it 'responds well to fuzzed input' do
+    random = Random.new(12345)
+    1024.times do
+      begin
+        result = NuObjectParser.new.parse(random.bytes(128))
+        expect(result).to be_kind_of(Array)
+      rescue NuObjectParser::Malformed
+        # Everything good, we failed as we should
+      end
+    end
+  end
+
 end

From f2724abdd7696325f1c4418a9cb6b984a4a03b76 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 13:48:11 +0200
Subject: [PATCH 06/18] Deal with reals better

---
 spec/parsers/pdf_parser/nu_object_parser.rb   | 152 +++++++++++++
 spec/parsers/pdf_parser/object_parser_spec.rb | 213 +++---------------
 2 files changed, 187 insertions(+), 178 deletions(-)
 create mode 100644 spec/parsers/pdf_parser/nu_object_parser.rb

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
new file mode 100644
index 00000000..300d1a48
--- /dev/null
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -0,0 +1,152 @@
+class NuObjectParser
+  Malformed = Class.new(RuntimeError)
+  RE = ->(str) { /#{Regexp.escape(str)}/ }
+  STRATEGIES = {
+    RE["/"]  => :parse_pdf_name,
+    RE["<<"] => :parse_dictionary,
+    RE["["]  => :parse_array,
+    RE["("]  => :parse_string,
+    RE["<"]  => :parse_hex_string,
+    /\d+ \d+ R/ => :parse_ref,
+
+    RE["true"]  => :wrap,
+    RE["false"] => :wrap,
+    RE["null"]  => :wrap,
+
+    # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals
+    /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real,
+    /(\-|\+?)(\d+)\./ => :wrap_real,
+    /(\-|\+?)\.(\d+)/ => :wrap_real,
+    /\-?(\d+)/ => :wrap_int,
+
+    RE["obj"]       => :wrap,
+    RE["endobj"]    => :wrap,
+    RE["stream"]    => :wrap,
+    RE["endstream"] => :wrap,
+
+    /\s+/           => :wrap_whitespace,
+    /./             => :garbage,
+  }
+
+  STRING_ESCAPES = {
+    "\r"   => "\n",
+    "\n\r" => "\n",
+    "\r\n" => "\n",
+    "\\n"  => "\n",
+    "\\r"  => "\r",
+    "\\t"  => "\t",
+    "\\b"  => "\b",
+    "\\f"  => "\f",
+    "\\("  => "(",
+    "\\)"  => ")",
+    "\\\\" => "\\",
+    "\\\n" => "",
+  }
+  0.upto(9)   { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr }
+  0.upto(99)  { |n| STRING_ESCAPES["\\0" + n.to_s]  = ("0"+n.to_s).oct.chr }
+  0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s]   = n.to_s.oct.chr }
+
+  def wrap_real(pattern)
+    [:real, @sc.scan(pattern).to_f]
+  end
+
+  def wrap_int(pattern)
+    [:int, @sc.scan(pattern).to_i]
+  end
+
+  def wrap_whitespace(pattern)
+    @sc.scan(pattern)
+    [:whitespace, nil]
+  end
+
+  def wrap(pattern)
+    [:lit, @sc.scan(pattern).to_sym]
+  end
+
+  def consume!(pattern, method_name)
+    at = @sc.pos
+    unless @sc.check(pattern)
+      $stderr.puts " : #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..."
+      return false
+    end
+    $stderr.puts   "M: #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..."
+    result = send(method_name, pattern)
+    @token_stream << result unless result == [:whitespace, nil]
+    true
+  end
+
+  def parse_ref(start_pattern)
+    [:ref, @sc.scan(start_pattern)]
+  end
+
+  def parse_array(start_pattern)
+    @sc.scan(start_pattern) # consume [
+    dict_open_at = @token_stream.length
+    walk_scanner(RE["]"])
+    raise Malformed, "Array did not terminate" unless @token_stream.pop == :terminator
+    array_items = @token_stream.pop(@token_stream.length - dict_open_at)
+    [:array, array_items]
+  end
+
+  def parse_dictionary(start_pattern)
+    @sc.scan(start_pattern) # consume <<
+    dict_open_at = @token_stream.length
+    walk_scanner(RE[">>"])
+    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
+    dict_items = @token_stream.pop(@token_stream.length - dict_open_at)
+    [:dict, dict_items]
+  end
+
+  def parse_hex_string(start_pattern)
+    str = @sc.scan(/<[0-9a-f]+>/i)
+    raise Malformed, "Malformed hex string at #{@sc.pos}" unless str
+
+    str << "0" unless str.bytesize % 2 == 0
+    hex_str = str.scan(/../).map {|i| i.hex.chr}.join
+    [:hex_string, hex_str]
+  end
+
+  def parse_string(start_pattern)
+    rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
+    raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
+    rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
+      STRING_ESCAPES[match] || ""
+    end
+  end
+
+  def parse_pdf_name(start_pattern)
+    letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join
+    name = @sc.scan(/\/[#{letters}\d]+/)
+    raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name
+    [:name, name]
+  end
+
+  def garbage(*)
+    raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one"
+  end
+
+  def walk_scanner(halt_at_pattern)
+    (@sc.string.bytesize - @sc.pos).times do
+      # Terminate if EOS reached
+      break if @sc.eos?
+
+      # Terminate early
+      if halt_at_pattern && halted = @sc.scan(halt_at_pattern)
+        @token_stream << :terminator
+        return
+      end
+
+      # Walk through STRATEGIES and stop iterating on first non-false call to consume!
+      STRATEGIES.find do |pattern, method_name|
+        consume!(pattern, method_name)
+      end
+    end
+  end
+
+  def parse(str)
+    @sc = StringScanner.new(str)
+    @token_stream = []
+    walk_scanner(_stop_at_pattern = nil)
+    @token_stream
+  end
+end
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
index 678049b9..6ab6fe87 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -1,177 +1,15 @@
 require 'spec_helper'
-
-class NuObjectParser
-  Malformed = Class.new(RuntimeError)
-  RE = ->(str) { /#{Regexp.escape(str)}/ }
-  STRATEGIES = {
-    RE["/"]  => :parse_pdf_name,
-    RE["<<"] => :parse_dictionary,
-    RE["["]  => :parse_array,
-    RE["("]  => :parse_string,
-    RE["<"]  => :parse_hex_string,
-    /\d+ \d+ R/ => :parse_ref,
-
-    RE["true"]  => :wrap,
-    RE["false"] => :wrap,
-    RE["null"]  => :wrap,
-
-    /\-?(\d+)\.(\d+)/ => :wrap_real,
-    /\-?(\d+)/ => :wrap_int,
-
-    RE["obj"]       => :wrap,
-    RE["endobj"]    => :wrap,
-    RE["stream"]    => :wrap,
-    RE["endstream"] => :wrap,
-#    RE[">>"]        => :wrap,
-#    RE["]"]         => :wrap,
-#    RE[">"]         => :wrap,
-#    RE[")"]         => :wrap,
-
-    /\s+/           => :wrap_whitespace,
-    /./             => :garbage,
-  }
-
-  STRING_ESCAPES = {
-    "\r"   => "\n",
-    "\n\r" => "\n",
-    "\r\n" => "\n",
-    "\\n"  => "\n",
-    "\\r"  => "\r",
-    "\\t"  => "\t",
-    "\\b"  => "\b",
-    "\\f"  => "\f",
-    "\\("  => "(",
-    "\\)"  => ")",
-    "\\\\" => "\\",
-    "\\\n" => "",
-  }
-  0.upto(9)   { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr }
-  0.upto(99)  { |n| STRING_ESCAPES["\\0" + n.to_s]  = ("0"+n.to_s).oct.chr }
-  0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s]   = n.to_s.oct.chr }
-
-  def wrap_true(sc, pattern)
-    @sc.scan(pattern)
-    true
-  end
-
-  def wrap_false(pattern)
-    @sc.scan(pattern)
-    false
-  end
-  
-  def wrap_nil(pattern)
-    @sc.scan(pattern)
-    nil
-  end
-
-  def wrap_real(pattern)
-    @sc.scan(pattern).to_f
-  end
-
-  def wrap_int(pattern)
-    @sc.scan(pattern).to_i
-  end
-
-  def wrap_whitespace(pattern)
-    @sc.scan(pattern)
-    :whitespace
-  end
-
-  def wrap(pattern)
-    data = @sc.scan(pattern)
-    data.to_sym
-  end
-
-  def consume!(pattern, method_name)
-    return unless @sc.check(pattern)
-    at = @sc.pos
-    result = send(method_name, pattern)
-    @token_stream << result unless result == :whitespace
-    true
-  end
-
-  def parse_ref(start_pattern)
-    [:ref, @sc.scan(start_pattern)]
-  end
-
-  def parse_array(start_pattern)
-    @sc.scan(start_pattern) # consume [
-    dict_open_at = @token_stream.length
-    walk_scanner(RE["]"])
-    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
-    array_items = @token_stream.pop(@token_stream.length - dict_open_at)
-    [:array, array_items]
-  end
-
-  def parse_dictionary(start_pattern)
-    @sc.scan(start_pattern) # consume <<
-    dict_open_at = @token_stream.length
-    walk_scanner(RE[">>"])
-    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
-    dict_items = @token_stream.pop(@token_stream.length - dict_open_at)
-    [:dict, dict_items]
-  end
-
-  def parse_hex_string(start_pattern)
-    str = @sc.scan(/<[0-9a-f]+>/i)
-    raise Malformed, "Malformed hex string at #{@sc.pos}" unless str
-
-    str << "0" unless str.size % 2 == 0
-    hex_str = str.scan(/../).map {|i| i.hex.chr}.join
-    [:hex_string, hex_str]
-  end
-
-  def parse_string(start_pattern)
-    rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
-    raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
-    rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
-      STRING_ESCAPES[match] || ""
-    end
-  end
-
-  def parse_pdf_name(start_pattern)
-    letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/"
-    name = @sc.scan(/\/[#{letters}\d]+/)
-    raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name
-    [:name, name]
-  end
-
-  def garbage(*)
-    raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one"
-  end
-
-  def walk_scanner(halt_at_pattern)
-    (@sc.string.bytesize - @sc.pos).times do
-      # Terminate if EOS reached
-      break if @sc.eos?
-
-      # Terminate early
-      if halt_at_pattern && halted = @sc.scan(halt_at_pattern)
-        @token_stream << :terminator
-        return
-      end
-
-      # Walk through STRATEGIES and stop iterating on first non-false call to consume!
-      STRATEGIES.find do |pattern, method_name|
-        consume!(pattern, method_name)
-      end
-    end
-  end
-
-  def parse(str)
-    @sc = StringScanner.new(str)
-    @token_stream = []
-    walk_scanner(_stop_at_pattern = nil)
-    @token_stream
-  end
-end
+require_relative 'nu_object_parser'
 
 describe 'Object parser' do
-  let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort }
-
-  it 'scans the extracted object definitions from the corpus' do
+  describe 'with extracted objects from corpus' do
+    fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort
     fixture_paths.each do |path|
-      result = NuObjectParser.new.parse(File.read(path))
+      it "scans #{File.basename(path)}" do
+        result = NuObjectParser.new.parse(File.read(path))
+        require 'pp'
+        pp result
+      end
     end
   end
 
@@ -184,11 +22,11 @@ def parse(str)
         [:array, [
           [:dict, [
             [:name, "/Name"], "Jim",
-            [:name, "/Age"], 39,
+            [:name, "/Age"], [:int, 39],
             [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]]
           ],
-          22,
-          44.55]
+          [:int, 22],
+          [:real, 44.55]]
         ]
       ]
     )
@@ -197,7 +35,7 @@ def parse(str)
   it 'scans a simple dictionary with strings and ints as values' do
     result = NuObjectParser.new.parse('<</Name (Jim) /Age 25>>')
     expect(result).to eq(
-      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]]
+      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]]
     )
   end
 
@@ -208,28 +46,40 @@ def parse(str)
       /Age
         25>>')
     expect(result).to eq(
-      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]]
+      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]]
+    )
+  end
+
+  it 'parses all kinds of reals' do
+    result = NuObjectParser.new.parse('34.5 -3.62 +123.6 4. -.002 0.0')
+    expect(result).to eq(
+      [[:real, 34.5], [:real, -3.62], [:real, 123.6], [:real, 4.0], [:real, -0.002], [:real, 0.0]]
     )
   end
 
   it 'parses an array of integers' do
     result = NuObjectParser.new.parse('[1 2 3 4]')
     expect(result).to eq(
-      [[:array, [1, 2, 3, 4]]]
+      [[:array, [[:int, 1], [:int, 2], [:int, 3], [:int, 4]]]]
     )
   end
 
   it 'scans an array of integers with one object ref in the middle' do
     result = NuObjectParser.new.parse('[1 20 00 R 3]')
     expect(result).to eq(
-      [[:array, [1, [:ref, "20 00 R"], 3]]]
+      [[:array, [[:int, 1], [:ref, "20 00 R"], [:int, 3]]]]
     )
   end
 
   it 'scans an array of names' do
     result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]')
     expect(result).to eq(
-      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium/Rare"]]]]
+      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]]
+    )
+
+    result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]')
+    expect(result).to eq(
+      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]]
     )
   end
 
@@ -240,6 +90,13 @@ def parse(str)
     )
   end
 
+  it 'handles paired braces in strings escapes' do
+    result = NuObjectParser.new.parse("(Foo () bar and (baz))")
+    expect(result).to eq(
+      ["Foo (with some bars)"]
+    )
+  end
+
   it 'detects an unterminated string' do
     expect {
       NuObjectParser.new.parse("(Hello there")

From 1d451359f7d556c4b07e40a6ae77140fb0847774 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 14:21:07 +0200
Subject: [PATCH 07/18] Improve debug prints a little

---
 spec/parsers/pdf_parser/nu_object_parser.rb | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index 300d1a48..98b087df 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -65,11 +65,8 @@ def wrap(pattern)
 
   def consume!(pattern, method_name)
     at = @sc.pos
-    unless @sc.check(pattern)
-      $stderr.puts " : #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..."
-      return false
-    end
-    $stderr.puts   "M: #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..."
+    return false unless @sc.check(pattern)
+    debug { "M: #{method_name} @#{at}: 8 chars after scan pointer #{@sc.peek(8).inspect}" }
     result = send(method_name, pattern)
     @token_stream << result unless result == [:whitespace, nil]
     true
@@ -149,4 +146,8 @@ def parse(str)
     walk_scanner(_stop_at_pattern = nil)
     @token_stream
   end
+
+  def debug
+    $stderr.puts(yield)
+  end
 end

From c02f5bc617fb60768bbb3dab14384915b2401c94 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 14:27:41 +0200
Subject: [PATCH 08/18] Explain the loop limiter

---
 spec/parsers/pdf_parser/nu_object_parser.rb | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index 98b087df..ed0d2245 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -123,6 +123,11 @@ def garbage(*)
   end
 
   def walk_scanner(halt_at_pattern)
+    # Limit the iterations to AT MOST (!) once per
+    # remaining byte to parse. This ensures we won't
+    # have parsing enter an infinite loop where we expect
+    # the string scanner to have advanced at least a byte forward
+    # but it would sit on the same offset indifinitely.
     (@sc.string.bytesize - @sc.pos).times do
       # Terminate if EOS reached
       break if @sc.eos?
@@ -134,6 +139,8 @@ def walk_scanner(halt_at_pattern)
       end
 
       # Walk through STRATEGIES and stop iterating on first non-false call to consume!
+      # STRATEGIES are arranged by order of specificity, so for most iterations
+      # something meaningful should be hit relatively quickly
       STRATEGIES.find do |pattern, method_name|
         consume!(pattern, method_name)
       end

From 56548a6aa2c6933a1ad43c4723fdfa33d9c576b4 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 15:37:38 +0200
Subject: [PATCH 09/18] Improve handling of names

---
 spec/parsers/pdf_parser/nu_object_parser.rb   | 36 ++++++++++++++++---
 spec/parsers/pdf_parser/object_parser_spec.rb | 32 +++++++++++++++++
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index ed0d2245..e8c01915 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -1,13 +1,31 @@
 class NuObjectParser
   Malformed = Class.new(RuntimeError)
   RE = ->(str) { /#{Regexp.escape(str)}/ }
+
+  NAME_RE = begin
+    # The ASCII subset permissible for PDF name values
+    printable_ascii = (32..126).to_a
+    printable_ascii.delete(' '.ord)
+    printable_ascii.delete('['.ord)
+    printable_ascii.delete(']'.ord)
+    printable_ascii.delete('<'.ord)
+    printable_ascii.delete('>'.ord)
+    printable_ascii.delete('('.ord)
+    printable_ascii.delete(')'.ord)
+    printable_ascii.delete('/'.ord)
+    printable_ascii.delete('\\'.ord)
+    exact_char_class = printable_ascii.map(&:chr).join
+    
+    /\/[#{exact_char_class}]{0,}/
+  end
+
   STRATEGIES = {
-    RE["/"]  => :parse_pdf_name,
     RE["<<"] => :parse_dictionary,
     RE["["]  => :parse_array,
     RE["("]  => :parse_string,
     RE["<"]  => :parse_hex_string,
     /\d+ \d+ R/ => :parse_ref,
+    NAME_RE  => :parse_pdf_name,
 
     RE["true"]  => :wrap,
     RE["false"] => :wrap,
@@ -28,6 +46,7 @@ class NuObjectParser
     /./             => :garbage,
   }
 
+  # Permitted character escapes. There aren't _that_ many so we can use a table
   STRING_ESCAPES = {
     "\r"   => "\n",
     "\n\r" => "\n",
@@ -42,6 +61,8 @@ class NuObjectParser
     "\\\\" => "\\",
     "\\\n" => "",
   }
+
+  # Octal character escapes that look like \001 etc
   0.upto(9)   { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr }
   0.upto(99)  { |n| STRING_ESCAPES["\\0" + n.to_s]  = ("0"+n.to_s).oct.chr }
   0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s]   = n.to_s.oct.chr }
@@ -104,6 +125,9 @@ def parse_hex_string(start_pattern)
   end
 
   def parse_string(start_pattern)
+    # This is murder. PDF allows paired braces to be put into a string literal
+    # without any escaping. This means that "(Horrible file format (with a cherry on top))"
+    # is a valid string. Needs attention.
     rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
     raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
     rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
@@ -112,10 +136,12 @@ def parse_string(start_pattern)
   end
 
   def parse_pdf_name(start_pattern)
-    letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join
-    name = @sc.scan(/\/[#{letters}\d]+/)
-    raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name
-    [:name, name]
+    name = @sc.scan(start_pattern)
+    # Replace #023 hex codes with the corresponding chars
+    name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |hex_code|
+      $1.to_i(16).chr
+    end
+    [:name, name_sans_escapes]
   end
 
   def garbage(*)
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
index 6ab6fe87..9ba84494 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -83,6 +83,38 @@
     )
   end
 
+  it 'handles names' do
+    names_str = %(
+      /Name1
+      /ASomewhatLongerName /A;Name_With-Various***Characters? /1.2
+      /$$
+      /@pattern
+      /.notdef
+      /Adobe#20Green
+      /PANTONE#205757#20CV
+      /paired#28#29parentheses
+      /The_Key_of_F#23_Minor
+      /A#42
+      /
+    )
+    result = NuObjectParser.new.parse(names_str)
+    expect(result).to eq([
+        [:name, "/Name1"],
+        [:name, "/ASomewhatLongerName"],
+        [:name, "/A;Name_With-Various***Characters?"],
+        [:name, "/1.2"],
+        [:name, "/$$"],
+        [:name, "/@pattern"],
+        [:name, "/.notdef"],
+        [:name, "/Adobe Green"],
+        [:name, "/PANTONE 5757 CV"],
+        [:name, "/paired()parentheses"],
+        [:name, "/The_Key_of_F#_Minor"],
+        [:name, "/AB"],
+        [:name, "/"]
+    ])
+  end
+
   it 'handles string escapes' do
     result = NuObjectParser.new.parse("(Foo \\(with some bars\\))")
     expect(result).to eq(

From 84e2c04eb795e3a4b062f5dc7eb40b08bd740850 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 16:30:59 +0200
Subject: [PATCH 10/18] Let's do some rubocop here

---
 spec/parsers/pdf_parser/nu_object_parser.rb   | 72 +++++++++----------
 spec/parsers/pdf_parser/object_parser_spec.rb | 71 +++++++++---------
 2 files changed, 73 insertions(+), 70 deletions(-)

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index e8c01915..ffcad3ba 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -15,21 +15,21 @@ class NuObjectParser
     printable_ascii.delete('/'.ord)
     printable_ascii.delete('\\'.ord)
     exact_char_class = printable_ascii.map(&:chr).join
-    
+
     /\/[#{exact_char_class}]{0,}/
   end
 
   STRATEGIES = {
-    RE["<<"] => :parse_dictionary,
-    RE["["]  => :parse_array,
-    RE["("]  => :parse_string,
-    RE["<"]  => :parse_hex_string,
+    RE['<<'] => :parse_dictionary,
+    RE['[']  => :parse_array,
+    RE['(']  => :parse_string,
+    RE['<']  => :parse_hex_string,
     /\d+ \d+ R/ => :parse_ref,
-    NAME_RE  => :parse_pdf_name,
+    NAME_RE => :parse_pdf_name,
 
-    RE["true"]  => :wrap,
-    RE["false"] => :wrap,
-    RE["null"]  => :wrap,
+    RE['true']  => :wrap,
+    RE['false'] => :wrap,
+    RE['null']  => :wrap,
 
     # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals
     /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real,
@@ -37,10 +37,10 @@ class NuObjectParser
     /(\-|\+?)\.(\d+)/ => :wrap_real,
     /\-?(\d+)/ => :wrap_int,
 
-    RE["obj"]       => :wrap,
-    RE["endobj"]    => :wrap,
-    RE["stream"]    => :wrap,
-    RE["endstream"] => :wrap,
+    RE['obj']       => :wrap,
+    RE['endobj']    => :wrap,
+    RE['stream']    => :wrap,
+    RE['endstream'] => :wrap,
 
     /\s+/           => :wrap_whitespace,
     /./             => :garbage,
@@ -51,21 +51,21 @@ class NuObjectParser
     "\r"   => "\n",
     "\n\r" => "\n",
     "\r\n" => "\n",
-    "\\n"  => "\n",
-    "\\r"  => "\r",
-    "\\t"  => "\t",
-    "\\b"  => "\b",
-    "\\f"  => "\f",
-    "\\("  => "(",
-    "\\)"  => ")",
-    "\\\\" => "\\",
-    "\\\n" => "",
+    '\\n'  => "\n",
+    '\\r'  => "\r",
+    '\\t'  => "\t",
+    '\\b'  => "\b",
+    '\\f'  => "\f",
+    '\\('  => '(',
+    '\\)'  => ')',
+    '\\\\' => '\\',
+    "\\\n" => '',
   }
 
   # Octal character escapes that look like \001 etc
-  0.upto(9)   { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr }
-  0.upto(99)  { |n| STRING_ESCAPES["\\0" + n.to_s]  = ("0"+n.to_s).oct.chr }
-  0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s]   = n.to_s.oct.chr }
+  0.upto(9)   { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr }
+  0.upto(99)  { |n| STRING_ESCAPES['\\0' + n.to_s]  = ('0' + n.to_s).oct.chr }
+  0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s]   = n.to_s.oct.chr }
 
   def wrap_real(pattern)
     [:real, @sc.scan(pattern).to_f]
@@ -100,8 +100,8 @@ def parse_ref(start_pattern)
   def parse_array(start_pattern)
     @sc.scan(start_pattern) # consume [
     dict_open_at = @token_stream.length
-    walk_scanner(RE["]"])
-    raise Malformed, "Array did not terminate" unless @token_stream.pop == :terminator
+    walk_scanner(RE[']'])
+    raise Malformed, 'Array did not terminate' unless @token_stream.pop == :terminator
     array_items = @token_stream.pop(@token_stream.length - dict_open_at)
     [:array, array_items]
   end
@@ -109,36 +109,36 @@ def parse_array(start_pattern)
   def parse_dictionary(start_pattern)
     @sc.scan(start_pattern) # consume <<
     dict_open_at = @token_stream.length
-    walk_scanner(RE[">>"])
-    raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator
+    walk_scanner(RE['>>'])
+    raise Malformed, 'Dictionary did not terminate' unless @token_stream.pop == :terminator
     dict_items = @token_stream.pop(@token_stream.length - dict_open_at)
     [:dict, dict_items]
   end
 
-  def parse_hex_string(start_pattern)
+  def parse_hex_string(_start_pattern)
     str = @sc.scan(/<[0-9a-f]+>/i)
     raise Malformed, "Malformed hex string at #{@sc.pos}" unless str
 
-    str << "0" unless str.bytesize % 2 == 0
-    hex_str = str.scan(/../).map {|i| i.hex.chr}.join
+    str << '0' unless str.bytesize.even?
+    hex_str = str.scan(/../).map { |i| i.hex.chr }.join
     [:hex_string, hex_str]
   end
 
-  def parse_string(start_pattern)
+  def parse_string(_start_pattern)
     # This is murder. PDF allows paired braces to be put into a string literal
     # without any escaping. This means that "(Horrible file format (with a cherry on top))"
     # is a valid string. Needs attention.
     rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
     raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
     rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
-      STRING_ESCAPES[match] || ""
+      STRING_ESCAPES[match] || ''
     end
   end
 
   def parse_pdf_name(start_pattern)
     name = @sc.scan(start_pattern)
     # Replace #023 hex codes with the corresponding chars
-    name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |hex_code|
+    name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code|
       $1.to_i(16).chr
     end
     [:name, name_sans_escapes]
@@ -181,6 +181,6 @@ def parse(str)
   end
 
   def debug
-    $stderr.puts(yield)
+    warn(yield)
   end
 end
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
index 9ba84494..aad2997a 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -19,14 +19,18 @@
     result = parser.parse(obj)
     expect(result).to eq(
       [
-        [:array, [
-          [:dict, [
-            [:name, "/Name"], "Jim",
-            [:name, "/Age"], [:int, 39],
-            [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]]
-          ],
-          [:int, 22],
-          [:real, 44.55]]
+        [
+          :array, [
+            [
+              :dict, [
+                [:name, '/Name'], 'Jim',
+                [:name, '/Age'], [:int, 39],
+                [:name, '/Children'], [:array, ['Heather', 'Timothy', 'Rebecca']]
+              ]
+            ],
+            [:int, 22],
+            [:real, 44.55]
+          ]
         ]
       ]
     )
@@ -35,7 +39,7 @@
   it 'scans a simple dictionary with strings and ints as values' do
     result = NuObjectParser.new.parse('<</Name (Jim) /Age 25>>')
     expect(result).to eq(
-      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]]
+      [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]]
     )
   end
 
@@ -46,7 +50,7 @@
       /Age
         25>>')
     expect(result).to eq(
-      [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]]
+      [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]]
     )
   end
 
@@ -67,19 +71,19 @@
   it 'scans an array of integers with one object ref in the middle' do
     result = NuObjectParser.new.parse('[1 20 00 R 3]')
     expect(result).to eq(
-      [[:array, [[:int, 1], [:ref, "20 00 R"], [:int, 3]]]]
+      [[:array, [[:int, 1], [:ref, '20 00 R'], [:int, 3]]]]
     )
   end
 
   it 'scans an array of names' do
     result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]')
     expect(result).to eq(
-      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]]
+      [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
     )
 
     result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]')
     expect(result).to eq(
-      [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]]
+      [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
     )
   end
 
@@ -99,51 +103,51 @@
     )
     result = NuObjectParser.new.parse(names_str)
     expect(result).to eq([
-        [:name, "/Name1"],
-        [:name, "/ASomewhatLongerName"],
-        [:name, "/A;Name_With-Various***Characters?"],
-        [:name, "/1.2"],
-        [:name, "/$$"],
-        [:name, "/@pattern"],
-        [:name, "/.notdef"],
-        [:name, "/Adobe Green"],
-        [:name, "/PANTONE 5757 CV"],
-        [:name, "/paired()parentheses"],
-        [:name, "/The_Key_of_F#_Minor"],
-        [:name, "/AB"],
-        [:name, "/"]
+      [:name, '/Name1'],
+      [:name, '/ASomewhatLongerName'],
+      [:name, '/A;Name_With-Various***Characters?'],
+      [:name, '/1.2'],
+      [:name, '/$$'],
+      [:name, '/@pattern'],
+      [:name, '/.notdef'],
+      [:name, '/Adobe Green'],
+      [:name, '/PANTONE 5757 CV'],
+      [:name, '/paired()parentheses'],
+      [:name, '/The_Key_of_F#_Minor'],
+      [:name, '/AB'],
+      [:name, '/']
     ])
   end
 
   it 'handles string escapes' do
-    result = NuObjectParser.new.parse("(Foo \\(with some bars\\))")
+    result = NuObjectParser.new.parse('(Foo \\(with some bars\\))')
     expect(result).to eq(
-      ["Foo (with some bars)"]
+      ['Foo (with some bars)']
     )
   end
 
   it 'handles paired braces in strings escapes' do
-    result = NuObjectParser.new.parse("(Foo () bar and (baz))")
+    result = NuObjectParser.new.parse('(Foo () bar and (baz))')
     expect(result).to eq(
-      ["Foo (with some bars)"]
+      ['Foo (with some bars)']
     )
   end
 
   it 'detects an unterminated string' do
     expect {
-      NuObjectParser.new.parse("(Hello there")
+      NuObjectParser.new.parse('(Hello there')
     }.to raise_error(/did not terminate/)
   end
 
   it 'detects an unterminated array' do
     expect {
-      NuObjectParser.new.parse("[")
+      NuObjectParser.new.parse('[')
     }.to raise_error(/did not terminate/)
   end
 
   it 'detects an unterminated dictionary' do
     expect {
-      NuObjectParser.new.parse("<< /Ohai")
+      NuObjectParser.new.parse('<< /Ohai')
     }.to raise_error(/did not terminate/)
   end
 
@@ -164,5 +168,4 @@
       end
     end
   end
-
 end

From 91af38361ea68420e7366cc8066a863e608e0d39 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Thu, 14 Jun 2018 16:36:12 +0200
Subject: [PATCH 11/18] Meh

---
 spec/parsers/pdf_parser/nu_object_parser.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index ffcad3ba..55d814a0 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -130,7 +130,7 @@ def parse_string(_start_pattern)
     # is a valid string. Needs attention.
     rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
     raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
-    rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
+    rest_of_string[1..-2].gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
       STRING_ESCAPES[match] || ''
     end
   end

From b268fbebd4e2773967dc5b67c5e376a76e7b5a2a Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 11:53:54 +0200
Subject: [PATCH 12/18] Getting there

---
 lib/parsers/pdf_parser.rb                     | 21 +++--
 spec/parsers/pdf_parser/nu_object_parser.rb   | 78 ++++++++++++++++---
 spec/parsers/pdf_parser/object_parser_spec.rb | 30 ++++---
 3 files changed, 97 insertions(+), 32 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index 9510236b..0ff515ca 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -14,7 +14,7 @@ class FormatParser::PDFParser
   # this. The only way of solving this correctly is by adding
   # different types of PDF's in the specs.
   #
-  EOF_MARKER    = '%EOF'
+  EOF_MARKER = '%EOF'
 
   def call(io)
     io = FormatParser::IOConstraint.new(io)
@@ -22,7 +22,7 @@ def call(io)
     return unless safe_read(io, 9) =~ PDF_MARKER
 
     io.seek(io.size - 5)
-#    return unless safe_read(io, 5) == '%%EOF'
+    #    return unless safe_read(io, 5) == '%%EOF'
 
     xref_offset = locate_xref_table_offset(io)
     return unless xref_offset
@@ -43,18 +43,17 @@ def call(io)
       # Then we need to actually go in, read the object and parse the dictionary - luckily
       # this is not that much trouble and we can read the entire object, since it is small.
       # So let's get at it.
-      next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway 
+      next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway
 
       # Do a quickie detection reading just a tiny piece of the object
       obj_header = safe_read(io, 32)
-      if obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog')
-        io.seek(xref.offset)
-        object_buf = io.read(xref.length_limit)
-        parse_object_with_dictionary(object_buf)
-      end
+      next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog')
+      io.seek(xref.offset)
+      object_buf = io.read(xref.length_limit)
+      parse_object_with_dictionary(object_buf)
     end
 
-    raise "nope"
+    raise 'nope'
     FormatParser::Document.new(
       format: :pdf,
       page_count: attributes[:page_count]
@@ -107,7 +106,7 @@ def parse_xref_table(io)
     end
 
     # Reject all disabled objects
-    xref_table.reject! {|e| e.entry_type == 'f' }
+    xref_table.reject! { |e| e.entry_type == 'f' }
 
     # Sort sequentially in ascending offset in document order
     xref_table.sort_by!(&:offset)
@@ -153,7 +152,7 @@ def max(*of_items)
   end
 
   def parse_object_with_dictionary(str)
-    File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') {|f| f << str }
+    File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') { |f| f << str }
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf
diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb
index 55d814a0..659be531 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/spec/parsers/pdf_parser/nu_object_parser.rb
@@ -27,9 +27,9 @@ class NuObjectParser
     /\d+ \d+ R/ => :parse_ref,
     NAME_RE => :parse_pdf_name,
 
-    RE['true']  => :wrap,
-    RE['false'] => :wrap,
-    RE['null']  => :wrap,
+    RE['true']  => :wrap_lit,
+    RE['false'] => :wrap_lit,
+    RE['null']  => :wrap_lit,
 
     # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals
     /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real,
@@ -80,7 +80,7 @@ def wrap_whitespace(pattern)
     [:whitespace, nil]
   end
 
-  def wrap(pattern)
+  def wrap_lit(pattern)
     [:lit, @sc.scan(pattern).to_sym]
   end
 
@@ -124,15 +124,36 @@ def parse_hex_string(_start_pattern)
     [:hex_string, hex_str]
   end
 
-  def parse_string(_start_pattern)
+  def parse_string(opening_brace_pattern)
     # This is murder. PDF allows paired braces to be put into a string literal
     # without any escaping. This means that "(Horrible file format (with a cherry on top))"
     # is a valid string. Needs attention.
-    rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped )
-    raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string
-    rest_of_string[1..-2].gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
+    @sc.scan(opening_brace_pattern) # just the "("
+    str = ""
+    count = 1
+    bytes_remaining_to_scan.times do
+      break if @sc.eos? || count == 0
+
+      byte = @sc.scan(/./)
+      if byte.nil?
+        count = 0 # unbalanced parens
+      elsif byte == 0x5C.chr # "\"
+        str << byte << @sc.scan(/\./).to_s
+      elsif byte == 0x28.chr # "("
+        str << "("
+        count += 1
+      elsif byte == 0x29.chr # ")"
+        count -= 1
+        str << ")" unless count == 0
+      else
+        str << byte unless count == 0
+      end
+      break if count == 0
+    end
+    unescaped = str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
       STRING_ESCAPES[match] || ''
     end
+    [:str, unescaped]
   end
 
   def parse_pdf_name(start_pattern)
@@ -148,13 +169,17 @@ def garbage(*)
     raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one"
   end
 
+  def bytes_remaining_to_scan
+    @sc.string.bytesize - @sc.pos
+  end
+
   def walk_scanner(halt_at_pattern)
     # Limit the iterations to AT MOST (!) once per
     # remaining byte to parse. This ensures we won't
     # have parsing enter an infinite loop where we expect
     # the string scanner to have advanced at least a byte forward
     # but it would sit on the same offset indifinitely.
-    (@sc.string.bytesize - @sc.pos).times do
+    bytes_remaining_to_scan.times do
       # Terminate if EOS reached
       break if @sc.eos?
 
@@ -173,13 +198,46 @@ def walk_scanner(halt_at_pattern)
     end
   end
 
-  def parse(str)
+  def tokenize(str)
     @sc = StringScanner.new(str)
     @token_stream = []
     walk_scanner(_stop_at_pattern = nil)
     @token_stream
   end
 
+  class PDFRef < Struct.new(:object_id, :object_gen)
+    def initialize(str)
+      super(*str.scan(/(\d+) (\d+) R/).first)
+    end
+  end
+
+  class PDFName < Struct.new(:name)
+  end
+
+  def parse(str)
+    ast = tokenize(str)
+    unwrap_token = ->(token) {
+      if token.length == 2 && token.first.is_a?(Symbol)
+        token_type, token_value = token
+        case token_type
+        when :dict
+          unwrapped_values = token_value.map(&unwrap_token)
+          keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 }
+          Hash[keys.zip(values)]
+        when :array
+          token_value.map(&unwrap_token)
+        when :name
+          PDFName.new(token_value)
+        when :lit
+          {:true => true, :false => false, :null => nil}.fetch(token_value)
+        end
+      else
+        token
+      end
+    }
+    unwrap_token.(ast)
+  end
+
   def debug
     warn(yield)
   end
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb
index aad2997a..a97a0c2d 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/object_parser_spec.rb
@@ -25,7 +25,12 @@
               :dict, [
                 [:name, '/Name'], 'Jim',
                 [:name, '/Age'], [:int, 39],
-                [:name, '/Children'], [:array, ['Heather', 'Timothy', 'Rebecca']]
+                [:name, '/Children'],
+                [:array, [
+                    [:str, 'Heather'],
+                    [:str, 'Timothy'],
+                    [:str, 'Rebecca']
+                ]]
               ]
             ],
             [:int, 22],
@@ -119,17 +124,20 @@
     ])
   end
 
-  it 'handles string escapes' do
-    result = NuObjectParser.new.parse('(Foo \\(with some bars\\))')
+  it 'handles paired braces and strings escapes' do
+    result = NuObjectParser.new.parse('
+      (Foo \\(with some bars\\))
+      (Foo () bar and (baz))
+      (Foo (with some bars))
+      (((())))
+    ')
     expect(result).to eq(
-      ['Foo (with some bars)']
-    )
-  end
-
-  it 'handles paired braces in strings escapes' do
-    result = NuObjectParser.new.parse('(Foo () bar and (baz))')
-    expect(result).to eq(
-      ['Foo (with some bars)']
+      [
+        [:str, "Foo (with some bars)"],
+        [:str, "Foo () bar and (baz)"],
+        [:str, "Foo (with some bars)"],
+        [:str, "((()))"]
+      ]
     )
   end
 

From db3fd70ab0eed9898484efdb02b4e9751cc6bd25 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 12:56:52 +0200
Subject: [PATCH 13/18] Move things to all the right places

---
 lib/parsers/pdf_parser.rb                     |  23 ++--
 .../parsers/pdf_parser/tokenizer.rb           | 112 +++++-------------
 lib/parsers/pdf_parser/transformer.rb         | 107 +++++++++++++++++
 ...bject_parser_spec.rb => tokenizer_spec.rb} |  92 +++++++-------
 4 files changed, 197 insertions(+), 137 deletions(-)
 rename spec/parsers/pdf_parser/nu_object_parser.rb => lib/parsers/pdf_parser/tokenizer.rb (64%)
 create mode 100644 lib/parsers/pdf_parser/transformer.rb
 rename spec/parsers/pdf_parser/{object_parser_spec.rb => tokenizer_spec.rb} (58%)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index 0ff515ca..c2e61be6 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -1,4 +1,6 @@
 class FormatParser::PDFParser
+  require_relative 'pdf_parser/tokenizer'
+  require_relative 'pdf_parser/transformer'
   include FormatParser::IOUtils
 
   # First 9 bytes of a PDF should be in this format, according to:
@@ -50,7 +52,7 @@ def call(io)
       next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog')
       io.seek(xref.offset)
       object_buf = io.read(xref.length_limit)
-      parse_object_with_dictionary(object_buf)
+      parse_pdf_object(object_buf)
     end
 
     raise 'nope'
@@ -130,19 +132,20 @@ def pairwise(enum)
     end
   end
 
-  def read_until_linebreak(io, char_limit: 32)
+  def read_until_delimiter(io, delimiter:, char_limit: 32)
     buf = StringIO.new(''.b)
     char_limit.times do
       char = safe_read(io, 1).force_encoding(Encoding::BINARY)
-      if char == "\n"
-        break
-      else
-        buf << char
-      end
+      buf << char
+      break if buf.string.end_with?(delimiter)
     end
     buf.string.strip
   end
 
+  def read_until_linebreak(io, char_limit: 32)
+    read_until_delimiter(io, delimiter: "\n", char_limit: char_limit)
+  end
+
   def min(*of_items)
     of_items.sort.shift
   end
@@ -151,8 +154,10 @@ def max(*of_items)
     of_items.sort.pop
   end
 
-  def parse_object_with_dictionary(str)
-    File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') { |f| f << str }
+  def parse_pdf_object(str)
+    token_stream = Tokenizer.new.tokenize(str)
+    tree = Transformer.new.transform(token_stream)
+    $stderr.puts tree.inspect
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf
diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/lib/parsers/pdf_parser/tokenizer.rb
similarity index 64%
rename from spec/parsers/pdf_parser/nu_object_parser.rb
rename to lib/parsers/pdf_parser/tokenizer.rb
index 659be531..c95ee33f 100644
--- a/spec/parsers/pdf_parser/nu_object_parser.rb
+++ b/lib/parsers/pdf_parser/tokenizer.rb
@@ -1,4 +1,4 @@
-class NuObjectParser
+class FormatParser::PDFParser::Tokenizer
   Malformed = Class.new(RuntimeError)
   RE = ->(str) { /#{Regexp.escape(str)}/ }
 
@@ -23,7 +23,7 @@ class NuObjectParser
     RE['<<'] => :parse_dictionary,
     RE['[']  => :parse_array,
     RE['(']  => :parse_string,
-    RE['<']  => :parse_hex_string,
+    /<[0-9a-f]+>/i  => :parse_hex_string,
     /\d+ \d+ R/ => :parse_ref,
     NAME_RE => :parse_pdf_name,
 
@@ -37,42 +37,24 @@ class NuObjectParser
     /(\-|\+?)\.(\d+)/ => :wrap_real,
     /\-?(\d+)/ => :wrap_int,
 
-    RE['obj']       => :wrap,
-    RE['endobj']    => :wrap,
-    RE['stream']    => :wrap,
-    RE['endstream'] => :wrap,
+    RE['obj']       => :wrap_lit,
+    # Use dirty trick to stop parsing if we encounter anything binary. This does not
+    # prevent us from reading ahead into the stream, but it does allow us to abort
+    # quicker
+    RE['endobj']    => :abort,
+    RE['stream']    => :abort,
+    RE['endstream'] => :abort,
 
     /\s+/           => :wrap_whitespace,
     /./             => :garbage,
   }
 
-  # Permitted character escapes. There aren't _that_ many so we can use a table
-  STRING_ESCAPES = {
-    "\r"   => "\n",
-    "\n\r" => "\n",
-    "\r\n" => "\n",
-    '\\n'  => "\n",
-    '\\r'  => "\r",
-    '\\t'  => "\t",
-    '\\b'  => "\b",
-    '\\f'  => "\f",
-    '\\('  => '(',
-    '\\)'  => ')',
-    '\\\\' => '\\',
-    "\\\n" => '',
-  }
-
-  # Octal character escapes that look like \001 etc
-  0.upto(9)   { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr }
-  0.upto(99)  { |n| STRING_ESCAPES['\\0' + n.to_s]  = ('0' + n.to_s).oct.chr }
-  0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s]   = n.to_s.oct.chr }
-
   def wrap_real(pattern)
-    [:real, @sc.scan(pattern).to_f]
+    [:real, @sc.scan(pattern)]
   end
 
   def wrap_int(pattern)
-    [:int, @sc.scan(pattern).to_i]
+    [:int, @sc.scan(pattern)]
   end
 
   def wrap_whitespace(pattern)
@@ -115,13 +97,8 @@ def parse_dictionary(start_pattern)
     [:dict, dict_items]
   end
 
-  def parse_hex_string(_start_pattern)
-    str = @sc.scan(/<[0-9a-f]+>/i)
-    raise Malformed, "Malformed hex string at #{@sc.pos}" unless str
-
-    str << '0' unless str.bytesize.even?
-    hex_str = str.scan(/../).map { |i| i.hex.chr }.join
-    [:hex_string, hex_str]
+  def parse_hex_string(start_pattern)
+    [:hex_string, @sc.scan(start_pattern)]
   end
 
   def parse_string(opening_brace_pattern)
@@ -132,6 +109,7 @@ def parse_string(opening_brace_pattern)
     str = ""
     count = 1
     bytes_remaining_to_scan.times do
+      # Terminate if EOS reached or once we encountered the outermost closing brace
       break if @sc.eos? || count == 0
 
       byte = @sc.scan(/./)
@@ -150,19 +128,12 @@ def parse_string(opening_brace_pattern)
       end
       break if count == 0
     end
-    unescaped = str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
-      STRING_ESCAPES[match] || ''
-    end
-    [:str, unescaped]
+    raise Malformed, "String did not terminate at #{@sc.pos}" if count > 0
+    [:str, str]
   end
 
   def parse_pdf_name(start_pattern)
-    name = @sc.scan(start_pattern)
-    # Replace #023 hex codes with the corresponding chars
-    name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code|
-      $1.to_i(16).chr
-    end
-    [:name, name_sans_escapes]
+    [:name, @sc.scan(start_pattern)]
   end
 
   def garbage(*)
@@ -198,47 +169,24 @@ def walk_scanner(halt_at_pattern)
     end
   end
 
-  def tokenize(str)
-    @sc = StringScanner.new(str)
-    @token_stream = []
-    walk_scanner(_stop_at_pattern = nil)
-    @token_stream
+  # Dirty thing we use to stop parsing as soon as we encounter "endobj", "stream" or "endstream"
+  def abort(pattern)
+    str = @sc.scan(pattern)
+    debug { "X: Aborting tokenization at #{str.inspect} @#{@sc.pos}" }
+    throw :_abort_
   end
 
-  class PDFRef < Struct.new(:object_id, :object_gen)
-    def initialize(str)
-      super(*str.scan(/(\d+) (\d+) R/).first)
+  def tokenize(str, verbose: false)
+    @verbose = verbose
+    @sc = StringScanner.new(str.force_encoding(Encoding::BINARY))
+    @token_stream = []
+    catch :_abort_ do
+      walk_scanner(_stop_at_pattern = nil)
     end
-  end
-
-  class PDFName < Struct.new(:name)
-  end
-
-  def parse(str)
-    ast = tokenize(str)
-    unwrap_token = ->(token) {
-      if token.length == 2 && token.first.is_a?(Symbol)
-        token_type, token_value = token
-        case token_type
-        when :dict
-          unwrapped_values = token_value.map(&unwrap_token)
-          keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 }
-          Hash[keys.zip(values)]
-        when :array
-          token_value.map(&unwrap_token)
-        when :name
-          PDFName.new(token_value)
-        when :lit
-          {:true => true, :false => false, :null => nil}.fetch(token_value)
-        end
-      else
-        token
-      end
-    }
-    unwrap_token.(ast)
+    @token_stream
   end
 
   def debug
-    warn(yield)
+    warn(yield) if @verbose
   end
 end
diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb
new file mode 100644
index 00000000..b25a990e
--- /dev/null
+++ b/lib/parsers/pdf_parser/transformer.rb
@@ -0,0 +1,107 @@
+class FormatParser::PDFParser::Transformer
+  class PDFRef < Struct.new(:object_id, :object_gen)
+    def self.from_ref_str(str)
+      id_and_generation_str = str.scan(/(\d+) (\d+) R/).first
+      new(*id_and_generation_str.map(&:to_i))
+    end
+  end
+
+  class PDFName < Struct.new(:name)
+  end
+
+  # Permitted character escapes. There aren't _that_ many so we can use a table
+  STRING_ESCAPES = {
+    "\r"   => "\n",
+    "\n\r" => "\n",
+    "\r\n" => "\n",
+    '\\n'  => "\n",
+    '\\r'  => "\r",
+    '\\t'  => "\t",
+    '\\b'  => "\b",
+    '\\f'  => "\f",
+    '\\('  => '(',
+    '\\)'  => ')',
+    '\\\\' => '\\',
+    "\\\n" => '',
+  }
+
+  # Octal character escapes that look like \001 etc
+  0.upto(9)   { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr }
+  0.upto(99)  { |n| STRING_ESCAPES['\\0' + n.to_s]  = ('0' + n.to_s).oct.chr }
+  0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s]   = n.to_s.oct.chr }
+
+  LITERAL_VALUES = {
+    :true => true,
+    :false => false,
+    :null => nil,
+  }
+
+  def transform(tokens)
+    tokens.map {|t| unwrap(*t) }
+  end
+
+  def unwrap(token_type, token_value)
+    case token_type
+    when :dict
+      unwrap_dict(token_value)
+    when :array
+      unwrap_array(token_value)
+    when :real
+      unwrap_real(token_value)
+    when :int
+      unwrap_int(token_value)
+    when :ref
+      unwrap_ref(token_value)
+    when :name
+      unwrap_name(token_value)
+    when :lit
+      unwrap_lit(token_value)
+    else
+      token_value
+    end
+  end
+
+  def unwrap_real(value)
+    value.to_f
+  end
+
+  def unwrap_int(value)
+    value.to_i
+  end
+
+  def unwrap_dict(value)
+    unwrapped_values = value.map{|e| unwrap(*e) }
+    keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 }
+    Hash[keys.zip(values)]
+  end
+
+  def unwrap_lit(value)
+    LITERAL_VALUES.fetch(value, value.to_sym)
+  end
+
+  def unwrap_ref(value)
+    PDFRef.from_ref_str(value)
+  end
+
+  def unwrap_array(value)
+    value.map {|e| unwrap(*e) }
+  end
+
+  def unwrap_hex_string(str)
+    str << '0' unless str.bytesize.even?
+    str.scan(/../).map { |i| i.hex.chr }.join
+  end
+
+  def unwrap_string(str)
+    str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
+      STRING_ESCAPES[match] || ''
+    end
+  end
+
+  def unwrap_name(name)
+    # Replace #xx hex escapes (e.g. #20) with the corresponding chars
+    name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code|
+      $1.to_i(16).chr
+    end
+  end
+end
diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/tokenizer_spec.rb
similarity index 58%
rename from spec/parsers/pdf_parser/object_parser_spec.rb
rename to spec/parsers/pdf_parser/tokenizer_spec.rb
index a97a0c2d..ec45f628 100644
--- a/spec/parsers/pdf_parser/object_parser_spec.rb
+++ b/spec/parsers/pdf_parser/tokenizer_spec.rb
@@ -1,12 +1,19 @@
 require 'spec_helper'
-require_relative 'nu_object_parser'
 
-describe 'Object parser' do
+describe FormatParser::PDFParser::Tokenizer do
+  def tokenize(str)
+    FormatParser::PDFParser::Tokenizer.new.tokenize(str)
+  end
+
+  def tokenize_file_at(at_path)
+    FormatParser::PDFParser::Tokenizer.new.tokenize(File.read(at_path))
+  end
+
   describe 'with extracted objects from corpus' do
     fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort
     fixture_paths.each do |path|
       it "scans #{File.basename(path)}" do
-        result = NuObjectParser.new.parse(File.read(path))
+        result = tokenize_file_at(path)
         require 'pp'
         pp result
       end
@@ -14,9 +21,7 @@
   end
 
   it 'scans the example object from the PDF presentation' do
-    obj = File.read(__dir__ + '/example_a.pdfobj')
-    parser = NuObjectParser.new
-    result = parser.parse(obj)
+    result = tokenize_file_at(__dir__ + '/example_a.pdfobj')
     expect(result).to eq(
       [
         [
@@ -42,51 +47,51 @@
   end
 
   it 'scans a simple dictionary with strings and ints as values' do
-    result = NuObjectParser.new.parse('<</Name (Jim) /Age 25>>')
+    result = tokenize('<</Name (Jim) /Age 25>>')
     expect(result).to eq(
-      [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]]
+      [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]]
     )
   end
 
   it 'scans a simple dictionary with arbitrary whitespace' do
-    result = NuObjectParser.new.parse('<<
+    result = tokenize('<<
       /Name
         (Jim)
       /Age
         25>>')
     expect(result).to eq(
-      [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]]
+      [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]]
     )
   end
 
   it 'parses all kinds of reals' do
-    result = NuObjectParser.new.parse('34.5 -3.62 +123.6 4. -.002 0.0')
+    result = tokenize('34.5 -3.62 +123.6 4. -.002 0.0')
     expect(result).to eq(
-      [[:real, 34.5], [:real, -3.62], [:real, 123.6], [:real, 4.0], [:real, -0.002], [:real, 0.0]]
+      [[:real, "34.5"], [:real, "-3.62"], [:real, "+123.6"], [:real, "4."], [:real, "-.002"], [:real, "0.0"]]
     )
   end
 
   it 'parses an array of integers' do
-    result = NuObjectParser.new.parse('[1 2 3 4]')
+    result = tokenize('[1 2 3 4]')
     expect(result).to eq(
-      [[:array, [[:int, 1], [:int, 2], [:int, 3], [:int, 4]]]]
+      [[:array, [[:int, "1"], [:int, "2"], [:int, "3"], [:int, "4"]]]]
     )
   end
 
   it 'scans an array of integers with one object ref in the middle' do
-    result = NuObjectParser.new.parse('[1 20 00 R 3]')
+    result = tokenize('[1 20 00 R 3]')
     expect(result).to eq(
-      [[:array, [[:int, 1], [:ref, '20 00 R'], [:int, 3]]]]
+      [[:array, [[:int, "1"], [:ref, "20 00 R"], [:int, "3"]]]]
     )
   end
 
   it 'scans an array of names' do
-    result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]')
+    result = tokenize('[ /Type /Color /Medium/Rare ]')
     expect(result).to eq(
       [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
     )
 
-    result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]')
+    result = tokenize('[/Type/Color/Medium/Rare]')
     expect(result).to eq(
       [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
     )
@@ -106,72 +111,67 @@
       /A#42
       /
     )
-    result = NuObjectParser.new.parse(names_str)
+    result = tokenize(names_str)
     expect(result).to eq([
-      [:name, '/Name1'],
-      [:name, '/ASomewhatLongerName'],
-      [:name, '/A;Name_With-Various***Characters?'],
-      [:name, '/1.2'],
-      [:name, '/$$'],
-      [:name, '/@pattern'],
-      [:name, '/.notdef'],
-      [:name, '/Adobe Green'],
-      [:name, '/PANTONE 5757 CV'],
-      [:name, '/paired()parentheses'],
-      [:name, '/The_Key_of_F#_Minor'],
-      [:name, '/AB'],
-      [:name, '/']
+      [:name, "/Name1"],
+      [:name, "/ASomewhatLongerName"],
+      [:name, "/A;Name_With-Various***Characters?"],
+      [:name, "/1.2"],
+      [:name, "/$$"],
+      [:name, "/@pattern"],
+      [:name, "/.notdef"],
+      [:name, "/Adobe#20Green"],
+      [:name, "/PANTONE#205757#20CV"],
+      [:name, "/paired#28#29parentheses"],
+      [:name, "/The_Key_of_F#23_Minor"],
+      [:name, "/A#42"],
+      [:name, "/"]
     ])
   end
 
   it 'handles paired braces and strings escapes' do
-    result = NuObjectParser.new.parse('
+    result = tokenize('
       (Foo \\(with some bars\\))
       (Foo () bar and (baz))
       (Foo (with some bars))
       (((())))
     ')
     expect(result).to eq(
-      [
-        [:str, "Foo (with some bars)"],
-        [:str, "Foo () bar and (baz)"],
-        [:str, "Foo (with some bars)"],
-        [:str, "((()))"]
-      ]
+      [[:str, "Foo \\(with some bars\\)"], [:str, "Foo () bar and (baz)"], [:str, "Foo (with some bars)"], [:str, "((()))"]]
     )
   end
 
   it 'detects an unterminated string' do
     expect {
-      NuObjectParser.new.parse('(Hello there')
+      tokenize('(Hello there')
     }.to raise_error(/did not terminate/)
   end
 
   it 'detects an unterminated array' do
     expect {
-      NuObjectParser.new.parse('[')
+      tokenize('[')
     }.to raise_error(/did not terminate/)
   end
 
   it 'detects an unterminated dictionary' do
     expect {
-      NuObjectParser.new.parse('<< /Ohai')
+      tokenize('<< /Ohai')
     }.to raise_error(/did not terminate/)
   end
 
   it 'detects a truncated dictionary opener' do
     expect {
-      NuObjectParser.new.parse('<</')
-    }.to raise_error(/PDF name at 2/)
+      tokenize('<</')
+    }.to raise_error(/Dictionary did not terminate/)
   end
 
   it 'responds well to fuzzed input' do
     random = Random.new(12345)
     1024.times do
       begin
-        result = NuObjectParser.new.parse(random.bytes(128))
+        result = tokenize(random.bytes(128))
         expect(result).to be_kind_of(Array)
-      rescue NuObjectParser::Malformed
+      rescue FormatParser::PDFParser::Tokenizer::Malformed
         # Everything good, we failed as we should
       end
     end

From 8b62f675011b0664195a7f13aeb6e3734c8f1283 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 13:47:33 +0200
Subject: [PATCH 14/18] Continue

---
 lib/parsers/pdf_parser.rb | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index c2e61be6..06a856ac 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -32,7 +32,8 @@ def call(io)
     io.seek(xref_offset)
     xref_table = parse_xref_table(io)
 
-    # return unless xref_table.any?
+    return unless xref_table.any?
+
     xref_table.each do |xref|
       io.seek(xref.offset)
       # From here on out we need to proceed as follows. We need to buffer (preemptively)
@@ -45,14 +46,18 @@ def call(io)
       # Then we need to actually go in, read the object and parse the dictionary - luckily
       # this is not that much trouble and we can read the entire object, since it is small.
       # So let's get at it.
-      next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway
+      next if xref.length_limit > 1024 # Skip objects which are too large, they aren't what we are looking for anyway
+
+      # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need
+      # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?)
+      # but in practice we should be able to get away with just a few things here.
+      obj_header = safe_read(io, 64)
+      next unless obj_header.include?('/Pages') || obj_header.include?('/Catalog')
 
-      # Do a quickie detection reading just a tiny piece of the object
-      obj_header = safe_read(io, 32)
-      next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog')
       io.seek(xref.offset)
+      # Reduce the length limit - we should read less of it if we can
       object_buf = io.read(xref.length_limit)
-      parse_pdf_object(object_buf)
+      extract_pdf_object_dictionary(object_buf)
     end
 
     raise 'nope'
@@ -154,10 +159,14 @@ def max(*of_items)
     of_items.sort.pop
   end
 
-  def parse_pdf_object(str)
+  def extract_pdf_object_dictionary(str)
     token_stream = Tokenizer.new.tokenize(str)
     tree = Transformer.new.transform(token_stream)
-    $stderr.puts tree.inspect
+    # Locate the first hash in the parse tree
+    first_hash = tree.find {|e| e.is_a?(Hash) }
+    $stderr.puts first_hash.inspect
+  rescue => e
+    # Malformed PDF object or our parser has failed somewhere
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf

From b517be80e54d4fde47321312dcf3559856e346ed Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 18:42:55 +0200
Subject: [PATCH 15/18] Strscan is required

---
 lib/parsers/pdf_parser/tokenizer.rb | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/parsers/pdf_parser/tokenizer.rb b/lib/parsers/pdf_parser/tokenizer.rb
index c95ee33f..63528735 100644
--- a/lib/parsers/pdf_parser/tokenizer.rb
+++ b/lib/parsers/pdf_parser/tokenizer.rb
@@ -1,3 +1,5 @@
+require 'strscan'
+
 class FormatParser::PDFParser::Tokenizer
   Malformed = Class.new(RuntimeError)
   RE = ->(str) { /#{Regexp.escape(str)}/ }

From fbf145e5e3554777c31e44c3034f9bc2569fd736 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 18:43:11 +0200
Subject: [PATCH 16/18] Get rid of the special Name type we ended up not using

---
 lib/parsers/pdf_parser/transformer.rb | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb
index b25a990e..7aab792a 100644
--- a/lib/parsers/pdf_parser/transformer.rb
+++ b/lib/parsers/pdf_parser/transformer.rb
@@ -6,9 +6,6 @@ def self.from_ref_str(str)
     end
   end
 
-  class PDFName < Struct.new(:name)
-  end
-
   # Permitted character escapes. There aren't _that_ many so we can use a table
   STRING_ESCAPES = {
     "\r"   => "\n",

From 12ffb7209bbbac4143a8dbb9a44517771c824a42 Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Fri, 15 Jun 2018 18:44:32 +0200
Subject: [PATCH 17/18] Patch up the parser a bit

---
 lib/parsers/pdf_parser.rb | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index 06a856ac..9a584b7e 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -32,10 +32,7 @@ def call(io)
     io.seek(xref_offset)
     xref_table = parse_xref_table(io)
 
-    return unless xref_table.any?
-
     xref_table.each do |xref|
-      io.seek(xref.offset)
       # From here on out we need to proceed as follows. We need to buffer (preemptively)
       # all the /Type/Pages objects for later. We also need to recover the
       # /Type/Catalog object which will refer us to the right /Type /Pages object to use.
@@ -51,20 +48,23 @@ def call(io)
       # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need
       # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?)
       # but in practice we should be able to get away with just a few things here.
-      obj_header = safe_read(io, 64)
-      next unless obj_header.include?('/Pages') || obj_header.include?('/Catalog')
+      io.seek(xref.offset)
+      obj_header = io.read(64).to_s
+      next unless obj_header.include?('/Pages') || obj_header.include?('/Linearized')
 
+      # Seek to that object and read it whole, to the length limit or 1024 bytes whichever is lower
       io.seek(xref.offset)
-      # Reduce the length limit - we should read less of it if we can
-      object_buf = io.read(xref.length_limit)
-      extract_pdf_object_dictionary(object_buf)
+      object_buf = io.read(min(1024, xref.length_limit))
+      dict = extract_pdf_object_dictionary(object_buf)
+      if dict['/Type'] == '/Pages' && dict['/Count']
+        return FormatParser::Document.new(format: :pdf, page_count: dict['/Count'])
+      elsif dict['/Linearized'] && dict['/N']
+        return FormatParser::Document.new(format: :pdf, page_count: dict['/N'])
+      end
     end
 
-    raise 'nope'
-    FormatParser::Document.new(
-      format: :pdf,
-      page_count: attributes[:page_count]
-    )
+    # We could not determine page count
+    FormatParser::Document.new(format: :pdf)
   end
 
   def locate_xref_table_offset(io)
@@ -140,9 +140,8 @@ def pairwise(enum)
   def read_until_delimiter(io, delimiter:, char_limit: 32)
     buf = StringIO.new(''.b)
     char_limit.times do
-      char = safe_read(io, 1).force_encoding(Encoding::BINARY)
-      buf << char
-      break if buf.string.end_with?(delimiter)
+      buf << safe_read(io, 1).force_encoding(Encoding::BINARY)
+      break if buf.string.end_with?(delimiter) || buf.string.bytesize >= char_limit
     end
     buf.string.strip
   end
@@ -162,11 +161,13 @@ def max(*of_items)
   def extract_pdf_object_dictionary(str)
     token_stream = Tokenizer.new.tokenize(str)
     tree = Transformer.new.transform(token_stream)
-    # Locate the first hash in the parse tree
+    # Locate the first hash (dictionary) in the parse tree
     first_hash = tree.find {|e| e.is_a?(Hash) }
-    $stderr.puts first_hash.inspect
+    first_hash || {}
   rescue => e
+    $stderr.puts e
     # Malformed PDF object or our parser has failed somewhere
+    {}
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf

From dc9e2a3fe289e7d00ebb4a7043e9f329f3d7822c Mon Sep 17 00:00:00 2001
From: Julik Tarkhanov <me@julik.nl>
Date: Sat, 16 Jun 2018 00:40:14 +0200
Subject: [PATCH 18/18] Make spec titles a bit neater

---
 spec/parsers/pdf_parser_spec.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spec/parsers/pdf_parser_spec.rb b/spec/parsers/pdf_parser_spec.rb
index 2a99f590..33fe477e 100644
--- a/spec/parsers/pdf_parser_spec.rb
+++ b/spec/parsers/pdf_parser_spec.rb
@@ -13,13 +13,14 @@
   shared_examples :behave_like_pdf do |hash|
     let(:pdf_file) { hash.fetch(:file) }
 
-    it 'acts as a pdf' do
+    it 'is recognized as PDF' do
       expect(parsed_pdf).not_to be_nil
       expect(parsed_pdf.nature).to eq(:document)
       expect(parsed_pdf.format).to eq(:pdf)
     end
 
     it 'has a correct page count' do
+      expect(parsed_pdf).not_to be_nil
       expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
     end
   end