diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb
index f562dcbd..9a584b7e 100644
--- a/lib/parsers/pdf_parser.rb
+++ b/lib/parsers/pdf_parser.rb
@@ -1,4 +1,6 @@
 class FormatParser::PDFParser
+  require_relative 'pdf_parser/tokenizer'
+  require_relative 'pdf_parser/transformer'
   include FormatParser::IOUtils
 
   # First 9 bytes of a PDF should be in this format, according to:
@@ -14,62 +16,175 @@ class FormatParser::PDFParser
   # this. The only way of solving this correctly is by adding
   # different types of PDF's in the specs.
   #
-  COUNT_MARKERS = ['Count ']
-  EOF_MARKER    = '%EOF'
+  EOF_MARKER = '%EOF'
 
+  # Detects a PDF by its header and tries to establish the page count using the
+  # cross-reference table. Returns a FormatParser::Document (without a page count if
+  # none could be determined), or nil if the header or the xref pointer is missing.
   def call(io)
     io = FormatParser::IOConstraint.new(io)
 
     return unless safe_read(io, 9) =~ PDF_MARKER
 
-    attributes = scan_for_attributes(io)
+    xref_offset = locate_xref_table_offset(io)
+    return unless xref_offset
 
-    FormatParser::Document.new(
-      format: :pdf,
-      page_count: attributes[:page_count]
-    )
-  end
+    io.seek(xref_offset)
+    xref_table = parse_xref_table(io)
 
-  private
+    # Objects larger than this are never inspected, the dictionaries we are after are small.
+    # This also rules out the object with the highest offset, whose length is unknown.
+    max_object_size = 1024
 
-  # Read ahead bytes until one of % or / is reached.
-  # A header in a PDF always starts with a /
-  # The % is to detect the EOF
-  #
-  def scan_for_attributes(io)
-    result = {}
-
-    while read = safe_read(io, 1)
-      case read
-      when '%'
-        break if safe_read(io, EOF_MARKER.size) == EOF_MARKER
-      when '/'
-        find_page_count(io, result)
+    xref_table.each do |xref|
+      # We are after the /Type /Pages object, which carries the page /Count, or - in a
+      # linearized file - after the linearization dictionary, which carries it in /N.
+      # Such an object starts with the object header ("45 0 obj" etc), followed by an
+      # arbitrary amount of whitespace and the dictionary opener "<<".
+      # NOTE: the first matching /Pages object wins, wherever it sits in the page tree.
+      next if xref.length_limit > max_object_size
+
+      # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need
+      # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?)
+      # but in practice we should be able to get away with just a few things here.
+      io.seek(xref.offset)
+      obj_header = io.read(64).to_s
+      next unless obj_header.include?('/Pages') || obj_header.include?('/Linearized')
+
+      # Seek back to the start of the object and read it whole, up to its length limit
+      io.seek(xref.offset)
+      object_buf = io.read(min(max_object_size, xref.length_limit)).to_s
+      dict = extract_pdf_object_dictionary(object_buf)
+      if dict['/Type'] == '/Pages' && dict['/Count']
+        return FormatParser::Document.new(format: :pdf, page_count: dict['/Count'])
+      elsif dict['/Linearized'] && dict['/N']
+        return FormatParser::Document.new(format: :pdf, page_count: dict['/N'])
       end
     end
 
-    result
+    # We could not determine page count
+    FormatParser::Document.new(format: :pdf)
+  end
+
+  # Finds the offset of the cross-reference table, as declared by the "startxref"
+  # keyword in the tail of the file. Returns the offset as an Integer, or nil if
+  # no such declaration could be found.
+  def locate_xref_table_offset(io)
+    # The "startxref" declaration sits right before the closing "%%EOF", so the tail suffices
+    tail_size = 1024
+    tail_pos = max(0, io.size - tail_size)
+
+    io.seek(tail_pos)
+    tail = io.read(tail_size).to_s
+
+    # An incrementally updated PDF carries one "startxref" per revision and the last
+    # one is authoritative, so search backwards from the end of the tail
+    start_xref_index = tail.rindex('startxref')
+    return unless start_xref_index
+
+    # The offset is the first group of digits which follows the keyword
+    startxref = tail[start_xref_index..-1][/\d+/]
+    return unless startxref
+
+    startxref.to_i
   end
 
-  def find_page_count(io, result)
-    COUNT_MARKERS.each do |marker|
-      if safe_read(io, marker.size) == marker
-        result[:page_count] = read_numbers(io)
+  # A single entry of the cross-reference table. length_limit is the distance to the
+  # next object in the file, which makes it the upper bound for the size of the object.
+  XRef = Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit)
+
+  # Parses the cross-reference table at the current position of io, up to the "trailer"
+  # keyword. Returns the XRef entries which are not free, sorted by offset, with the
+  # length_limit set for every entry but the last one. Raises a RuntimeError if the number
+  # of entries differs from the number declared by the subsection headers.
+  def parse_xref_table(io)
+    xref_table = []
+    next_idx = 0
+    declared_count = nil
+    unknown_length = 99_999_999 # A ridiculous value, since the length is not known yet
+    while (line = read_until_linebreak(io, char_limit: 32))
+      case line
+      when /xref/
+        # Starts the cross-reference table
+      when /^(\d+) (\d+)$/
+        # A subsection header: the number of the first object of the subsection and the
+        # number of entries in it. A table can contain several subsections, so the
+        # declared counts are summed up and the numbering restarts at every header.
+        next_idx = Regexp.last_match(1).to_i
+        declared_count = declared_count.to_i + Regexp.last_match(2).to_i
+      when /^(\d{10}) (\d{5}) (\w)$/
+        # The actual entry: the object offset, the generation number and the entry type
+        offset, generation, entry_type = Regexp.last_match.captures
+        xref_table << XRef.new(next_idx, offset.to_i, generation.to_i, entry_type, unknown_length)
+        next_idx += 1
+      when /trailer/
+        break
       end
     end
+
+    # Check if the number of xrefs we got makes sense
+    if declared_count && declared_count != xref_table.length
+      raise "The xref table was declared to contain #{declared_count} object refs but contained #{xref_table.length}"
+    end
+
+    # Reject all disabled objects
+    xref_table.reject! { |e| e.entry_type == 'f' }
+
+    # Sort sequentially in ascending offset in document order
+    xref_table.sort_by!(&:offset)
+
+    # Update the limits which will tell us how much we need to read to have the entire object
+    pairwise(xref_table) do |xref_a, xref_b|
+      xref_a.length_limit = xref_b.offset - xref_a.offset
+    end
+
+    xref_table
   end
 
-  # Read ahead bytes until no more numbers are found
-  # This assumes that the position of io starts at a
-  # number
-  def read_numbers(io)
-    numbers = ''
+  # Yields every two consecutive items of enum: (a, b), then (b, c), then (c, d) and so on
+  def pairwise(enum)
+    enum.each_cons(2) { |a, b| yield(a, b) }
+  end
 
-    while c = safe_read(io, 1)
-      c =~ /\d+/ ? numbers << c : break
+  # Reads from io byte by byte, until the delimiter has been read or char_limit bytes
+  # have been consumed. Returns the bytes read as a binary String, stripped of the
+  # surrounding whitespace.
+  # NOTE(review): what happens at EOF depends on safe_read, which is defined elsewhere - confirm.
+  def read_until_delimiter(io, delimiter:, char_limit: 32)
+    buf = ''.b
+    char_limit.times do
+      buf << safe_read(io, 1).force_encoding(Encoding::BINARY)
+      break if buf.end_with?(delimiter)
     end
+    buf.strip
+  end
+
+  # Reads a single line of at most char_limit bytes, see read_until_delimiter
+  def read_until_linebreak(io, char_limit: 32)
+    read_until_delimiter(io, delimiter: "\n", char_limit: char_limit)
+  end
+
+  # Returns the smallest of the given items
+  def min(*of_items)
+    of_items.min
+  end
+
+  # Returns the largest of the given items
+  def max(*of_items)
+    of_items.max
+  end
 
-    numbers.to_i
+  # Tokenizes and transforms the given PDF object source and returns the first dictionary
+  # found in it as a Hash. Returns an empty Hash if there is none or if parsing fails.
+  def extract_pdf_object_dictionary(str)
+    token_stream = Tokenizer.new.tokenize(str)
+    tree = Transformer.new.transform(token_stream)
+    # Locate the first hash (dictionary) in the parse tree
+    tree.find { |e| e.is_a?(Hash) } || {}
+  rescue => e
+    # Malformed PDF object or our parser has failed somewhere
+    warn(e.message)
+    {}
   end
 
   FormatParser.register_parser self, natures: :document, formats: :pdf
diff --git a/lib/parsers/pdf_parser/tokenizer.rb b/lib/parsers/pdf_parser/tokenizer.rb
new file mode 100644
index 00000000..63528735
--- /dev/null
+++ b/lib/parsers/pdf_parser/tokenizer.rb
@@ -0,0 +1,201 @@
+require 'strscan'
+
+# Splits the textual representation of PDF objects into tokens of the form [type, value].
+# Dictionaries and arrays nest: they become [:dict, tokens] and [:array, tokens].
+# Tokenization stops quietly at "endobj", "stream" and "endstream", while input which
+# can not be tokenized raises Malformed.
+class FormatParser::PDFParser::Tokenizer
+  Malformed = Class.new(RuntimeError)
+  RE = ->(str) { /#{Regexp.escape(str)}/ }
+
+  NAME_RE = begin
+    # The ASCII subset permissible for PDF name values: printable, sans the space and delimiters
+    delimiters = ['[', ']', '<', '>', '(', ')', '/', '\\']
+    permitted = (33..126).map(&:chr) - delimiters
+    # Escape every character, so that none of them is special inside of the character class
+    exact_char_class = permitted.map { |char| Regexp.escape(char) }.join
+
+    /\/[#{exact_char_class}]{0,}/
+  end
+
+  # Maps the pattern to look for at the scan pointer to the method consuming the match.
+  # The first matching pattern wins, so the order of the entries matters.
+  STRATEGIES = {
+    RE['<<'] => :parse_dictionary,
+    RE['[']  => :parse_array,
+    RE['(']  => :parse_string,
+    # Hex strings may be empty and may contain whitespace between the digits
+    /<[0-9a-f\s]*>/i => :parse_hex_string,
+    /\d+ \d+ R/ => :parse_ref,
+    NAME_RE => :parse_pdf_name,
+
+    RE['true']  => :wrap_lit,
+    RE['false'] => :wrap_lit,
+    RE['null']  => :wrap_lit,
+
+    # 34.5 -3.62 +123.6 4. -.002 0.0 are all valid reals
+    /[+-]?(\d+\.\d*|\.\d+)/ => :wrap_real,
+    # 123 +17 -98 0 are all valid integers
+    /[+-]?\d+/ => :wrap_int,
+
+    RE['obj'] => :wrap_lit,
+    # Use dirty trick to stop parsing if we encounter anything binary. This does not
+    # prevent us from reading ahead into the stream, but it does allow us to abort
+    # quicker
+    RE['endobj']    => :abort,
+    RE['stream']    => :abort,
+    RE['endstream'] => :abort,
+
+    /\s+/ => :wrap_whitespace,
+    /./   => :garbage,
+  }.freeze
+
+  def wrap_real(pattern)
+    [:real, @sc.scan(pattern)]
+  end
+
+  def wrap_int(pattern)
+    [:int, @sc.scan(pattern)]
+  end
+
+  # Whitespace produces a marker token, which consume! keeps out of the token stream
+  def wrap_whitespace(pattern)
+    @sc.scan(pattern)
+    [:whitespace, nil]
+  end
+
+  def wrap_lit(pattern)
+    [:lit, @sc.scan(pattern).to_sym]
+  end
+
+  # Runs the strategy method_name if the pattern matches at the scan pointer and appends
+  # the token it returns to the token stream. Returns true if the pattern has matched.
+  def consume!(pattern, method_name)
+    at = @sc.pos
+    return false unless @sc.check(pattern)
+    debug { "M: #{method_name} @#{at}: 8 chars after scan pointer #{@sc.peek(8).inspect}" }
+    result = send(method_name, pattern)
+    @token_stream << result unless result == [:whitespace, nil]
+    true
+  end
+
+  def parse_ref(start_pattern)
+    [:ref, @sc.scan(start_pattern)]
+  end
+
+  def parse_array(start_pattern)
+    [:array, collect_tokens_until(RE[']'], start_pattern, 'Array')]
+  end
+
+  def parse_dictionary(start_pattern)
+    [:dict, collect_tokens_until(RE['>>'], start_pattern, 'Dictionary')]
+  end
+
+  # Consumes the opening delimiter and tokenizes up to the closing one. Returns the
+  # tokens found in between, raises Malformed if the closing delimiter is missing.
+  def collect_tokens_until(end_pattern, start_pattern, description)
+    @sc.scan(start_pattern)
+    opened_at = @token_stream.length
+    walk_scanner(end_pattern)
+    raise Malformed, "#{description} did not terminate" unless @token_stream.pop == :terminator
+    @token_stream.pop(@token_stream.length - opened_at)
+  end
+
+  def parse_hex_string(start_pattern)
+    [:hex_string, @sc.scan(start_pattern)]
+  end
+
+  # Consumes a literal string. The value is everything between the outermost parentheses,
+  # with the escape sequences left as they are.
+  def parse_string(opening_brace_pattern)
+    # This is murder. PDF allows paired braces to be put into a string literal
+    # without any escaping. This means that "(Horrible file format (with a cherry on top))"
+    # is a valid string, so we have to keep track of the nesting depth.
+    @sc.scan(opening_brace_pattern) # just the "("
+    str = String.new
+    depth = 1
+    until @sc.eos?
+      byte = @sc.get_byte # Unlike scan(/./) this also yields line breaks
+      case byte
+      when '\\'
+        # The byte after the backslash is escaped and must not affect the nesting depth
+        str << byte << @sc.get_byte.to_s
+      when '('
+        depth += 1
+        str << byte
+      when ')'
+        depth -= 1
+        break if depth.zero?
+        str << byte
+      else
+        str << byte
+      end
+    end
+    raise Malformed, "String did not terminate at #{@sc.pos}" if depth > 0
+    [:str, str]
+  end
+
+  def parse_pdf_name(start_pattern)
+    [:name, @sc.scan(start_pattern)]
+  end
+
+  def garbage(*)
+    raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one"
+  end
+
+  def bytes_remaining_to_scan
+    @sc.rest_size
+  end
+
+  # Tokenizes from the scan pointer onwards. If halt_at_pattern is given and matches,
+  # a :terminator marker is appended to the token stream and the walk ends.
+  def walk_scanner(halt_at_pattern)
+    # Limit the iterations to AT MOST (!) once per
+    # remaining byte to parse. This ensures we won't
+    # have parsing enter an infinite loop where we expect
+    # the string scanner to have advanced at least a byte forward
+    # but it would sit on the same offset indefinitely.
+    bytes_remaining_to_scan.times do
+      # Terminate if EOS reached
+      break if @sc.eos?
+
+      # Terminate early
+      if halt_at_pattern && @sc.scan(halt_at_pattern)
+        @token_stream << :terminator
+        return
+      end
+
+      # Walk through STRATEGIES and stop iterating on first non-false call to consume!
+      # STRATEGIES are arranged by order of specificity, so for most iterations
+      # something meaningful should be hit relatively quickly
+      STRATEGIES.find do |pattern, method_name|
+        consume!(pattern, method_name)
+      end
+    end
+  end
+
+  # Dirty thing we use to stop parsing as soon as we encounter "endobj", "stream" or "endstream".
+  # NOTE: this shadows Kernel#abort for the instances of this class.
+  def abort(pattern)
+    str = @sc.scan(pattern)
+    debug { "X: Aborting tokenization at #{str.inspect} @#{@sc.pos}" }
+    throw :_abort_
+  end
+
+  # Tokenizes str and returns the Array of tokens. The tokens are read from a binary
+  # copy of str, the string which was passed in is left untouched. With verbose set
+  # the progress gets reported through warn. Raises Malformed on input which makes no sense.
+  def tokenize(str, verbose: false)
+    @verbose = verbose
+    @sc = StringScanner.new(str.b)
+    @token_stream = []
+    catch :_abort_ do
+      walk_scanner(nil)
+    end
+    @token_stream
+  end
+
+  def debug
+    warn(yield) if @verbose
+  end
+end
diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb
new file mode 100644
index 00000000..7aab792a
--- /dev/null
+++ b/lib/parsers/pdf_parser/transformer.rb
@@ -0,0 +1,108 @@
+# Converts the token stream produced by the Tokenizer into plain Ruby values:
+# dictionaries become Hashes, arrays become Arrays, names and strings become Strings,
+# numbers become Integers and Floats, object references become PDFRef structs.
+class FormatParser::PDFParser::Transformer
+  # A reference to an indirect object, such as "12 0 R".
+  # NOTE: the object_id member replaces Object#object_id on instances of this struct.
+  class PDFRef < Struct.new(:object_id, :object_gen)
+    def self.from_ref_str(str)
+      id_and_generation_str = str.scan(/(\d+) (\d+) R/).first
+      new(*id_and_generation_str.map(&:to_i))
+    end
+  end
+
+  # Permitted character escapes. There aren't _that_ many so we can use a table
+  STRING_ESCAPES = {
+    "\r"   => "\n",
+    "\n\r" => "\n",
+    "\r\n" => "\n",
+    '\\n'  => "\n",
+    '\\r'  => "\r",
+    '\\t'  => "\t",
+    '\\b'  => "\b",
+    '\\f'  => "\f",
+    '\\('  => '(',
+    '\\)'  => ')',
+    '\\\\' => '\\',
+    "\\\n" => '',
+  }
+
+  # Octal character escapes that look like \1, \01 or \001. Only the digits 0 to 7 are
+  # permitted in them and the leading zeroes are optional.
+  0.upto(0o377) do |char_code|
+    %w[%o %02o %03o].each do |octal_format|
+      STRING_ESCAPES['\\' + format(octal_format, char_code)] = char_code.chr
+    end
+  end
+  STRING_ESCAPES.freeze
+
+  LITERAL_VALUES = { true: true, false: false, null: nil }.freeze
+
+  # Transforms an Array of tokens into an Array of the Ruby values they stand for
+  def transform(tokens)
+    tokens.map { |t| unwrap(*t) }
+  end
+
+  # Converts a single token into the Ruby value it stands for. The value of a token
+  # of an unknown type is returned as it is.
+  def unwrap(token_type, token_value)
+    case token_type
+    when :dict then unwrap_dict(token_value)
+    when :array then unwrap_array(token_value)
+    when :real then unwrap_real(token_value)
+    when :int then unwrap_int(token_value)
+    when :ref then unwrap_ref(token_value)
+    when :name then unwrap_name(token_value)
+    when :lit then unwrap_lit(token_value)
+    when :str then unwrap_string(token_value)
+    when :hex_string then unwrap_hex_string(token_value)
+    else token_value
+    end
+  end
+
+  def unwrap_real(value)
+    value.to_f
+  end
+
+  def unwrap_int(value)
+    value.to_i
+  end
+
+  # The tokens of a dictionary alternate between keys and values. A trailing key gets nil.
+  def unwrap_dict(value)
+    value.map { |e| unwrap(*e) }.each_slice(2).map { |key, val| [key, val] }.to_h
+  end
+
+  def unwrap_lit(value)
+    LITERAL_VALUES.fetch(value) { value.to_sym }
+  end
+
+  def unwrap_ref(value)
+    PDFRef.from_ref_str(value)
+  end
+
+  def unwrap_array(value)
+    value.map { |e| unwrap(*e) }
+  end
+
+  # Decodes a hex string into the bytes it describes. Everything which is not a hex digit
+  # (the angle brackets, whitespace) is skipped, a missing last digit is taken to be 0.
+  def unwrap_hex_string(str)
+    hex_digits = str.delete('^0-9a-fA-F')
+    hex_digits += '0' if hex_digits.length.odd?
+    [hex_digits].pack('H*')
+  end
+
+  # Resolves the escape sequences of a literal string and turns CR and CRLF into LF.
+  # A backslash in front of anything which is not in STRING_ESCAPES is dropped.
+  def unwrap_string(str)
+    str.gsub(/\\([nrtbf()\\\n]|[0-7]{1,3})?|\r\n?|\n\r/m) do |match|
+      STRING_ESCAPES[match] || ''
+    end
+  end
+
+  def unwrap_name(name)
+    # Replace #xx hex codes with the corresponding chars
+    name.gsub(/\#([\da-fA-F]{1,2})/) { Regexp.last_match(1).to_i(16).chr }
+  end
+end
diff --git a/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj
new file mode 100644
index 00000000..2530644b
--- /dev/null
+++ b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj
@@ -0,0 +1,7 @@
+44 0 obj
+<</Type/Catalog/Pages 31 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj
new file mode 100644
index 00000000..e3ff9300
--- /dev/null
+++ b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj
@@ -0,0 +1,7 @@
+12 0 obj
+<</Type/Catalog/Pages 4 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj
new file mode 100644
index 00000000..c2f92e6d
--- /dev/null
+++ b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj
@@ -0,0 +1,9 @@
+ 0 obj
+<</Type/Pages
+/Resources 11 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R ]
+endobj
+
+12 0 obj
+<</
\ No newline at end of file
diff --git a/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj b/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj
new file mode 100644
index 00000000..5659872d
--- /dev/null
+++ b/spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj
@@ -0,0 +1,7 @@
+15 0 obj
+<</Type/Catalog/Pages 7 0 R
+/OpenAction[1 0 R /XYZ null null 0]
+/Lang(en-US)
+>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj
new file mode 100644
index 00000000..4fe62abd
--- /dev/null
+++ b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj
@@ -0,0 +1,8 @@
+4 0 obj
+<</Type/Pages
+/Resources 11 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R ]
+/Count 1>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj
new file mode 100644
index 00000000..1cb8e07f
--- /dev/null
+++ b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj
@@ -0,0 +1,8 @@
+31 0 obj
+<</Type/Pages
+/Resources 43 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R 4 0 R 7 0 R 10 0 R 13 0 R 16 0 R 19 0 R 22 0 R 25 0 R 28 0 R ]
+/Count 10>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/example_a.pdfobj b/spec/parsers/pdf_parser/example_a.pdfobj
new file mode 100644
index 00000000..6d82200b
--- /dev/null
+++ b/spec/parsers/pdf_parser/example_a.pdfobj
@@ -0,0 +1,9 @@
+[
+ <<
+ /Name (Jim)
+ /Age 39
+ /Children [(Heather) (Timothy) (Rebecca)]
+ >>
+ 22
+ 44.55
+]
\ No newline at end of file
diff --git a/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj
new file mode 100644
index 00000000..24c92285
--- /dev/null
+++ b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj
@@ -0,0 +1,8 @@
+7 0 obj
+<</Type/Pages
+/Resources 14 0 R
+/MediaBox[ 0 0 595 842 ]
+/Kids[ 1 0 R 4 0 R ]
+/Count 2>>
+endobj
+
diff --git a/spec/parsers/pdf_parser/tokenizer_spec.rb b/spec/parsers/pdf_parser/tokenizer_spec.rb
new file mode 100644
index 00000000..ec45f628
--- /dev/null
+++ b/spec/parsers/pdf_parser/tokenizer_spec.rb
@@ -0,0 +1,179 @@
+require 'spec_helper'
+
+describe FormatParser::PDFParser::Tokenizer do
+  def tokenize(str)
+    FormatParser::PDFParser::Tokenizer.new.tokenize(str)
+  end
+
+  def tokenize_file_at(at_path)
+    FormatParser::PDFParser::Tokenizer.new.tokenize(File.read(at_path))
+  end
+
+  describe 'with extracted objects from corpus' do
+    fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort
+    fixture_paths.each do |path|
+      it "scans #{File.basename(path)}" do
+        # The fixtures carry no expectations, we only make sure that they can be tokenized
+        result = tokenize_file_at(path)
+        expect(result).to be_kind_of(Array)
+      end
+    end
+  end
+
+  it 'scans the example object from the PDF presentation' do
+    result = tokenize_file_at(__dir__ + '/example_a.pdfobj')
+    expect(result).to eq(
+      [
+        [
+          :array, [
+            [
+              :dict, [
+                [:name, '/Name'], [:str, 'Jim'],
+                [:name, '/Age'], [:int, '39'],
+                [:name, '/Children'],
+                [:array, [
+                    [:str, 'Heather'],
+                    [:str, 'Timothy'],
+                    [:str, 'Rebecca']
+                ]]
+              ]
+            ],
+            [:int, '22'],
+            [:real, '44.55']
+          ]
+        ]
+      ]
+    )
+  end
+
+  it 'scans a simple dictionary with strings and ints as values' do
+    result = tokenize('<</Name (Jim) /Age 25>>')
+    expect(result).to eq(
+      [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]]
+    )
+  end
+
+  it 'scans a simple dictionary with arbitrary whitespace' do
+    result = tokenize('<<
+      /Name
+        (Jim)
+      /Age
+        25>>')
+    expect(result).to eq(
+      [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]]
+    )
+  end
+
+  it 'parses all kinds of reals' do
+    result = tokenize('34.5 -3.62 +123.6 4. -.002 0.0')
+    expect(result).to eq(
+      [[:real, "34.5"], [:real, "-3.62"], [:real, "+123.6"], [:real, "4."], [:real, "-.002"], [:real, "0.0"]]
+    )
+  end
+
+  it 'parses an array of integers' do
+    result = tokenize('[1 2 3 4]')
+    expect(result).to eq(
+      [[:array, [[:int, "1"], [:int, "2"], [:int, "3"], [:int, "4"]]]]
+    )
+  end
+
+  it 'scans an array of integers with one object ref in the middle' do
+    result = tokenize('[1 20 00 R 3]')
+    expect(result).to eq(
+      [[:array, [[:int, "1"], [:ref, "20 00 R"], [:int, "3"]]]]
+    )
+  end
+
+  it 'scans an array of names' do
+    result = tokenize('[ /Type /Color /Medium/Rare ]')
+    expect(result).to eq(
+      [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
+    )
+
+    result = tokenize('[/Type/Color/Medium/Rare]')
+    expect(result).to eq(
+      [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]]
+    )
+  end
+
+  it 'handles names' do
+    names_str = %(
+      /Name1
+      /ASomewhatLongerName /A;Name_With-Various***Characters? /1.2
+      /$$
+      /@pattern
+      /.notdef
+      /Adobe#20Green
+      /PANTONE#205757#20CV
+      /paired#28#29parentheses
+      /The_Key_of_F#23_Minor
+      /A#42
+      /
+    )
+    result = tokenize(names_str)
+    expect(result).to eq([
+      [:name, "/Name1"],
+      [:name, "/ASomewhatLongerName"],
+      [:name, "/A;Name_With-Various***Characters?"],
+      [:name, "/1.2"],
+      [:name, "/$$"],
+      [:name, "/@pattern"],
+      [:name, "/.notdef"],
+      [:name, "/Adobe#20Green"],
+      [:name, "/PANTONE#205757#20CV"],
+      [:name, "/paired#28#29parentheses"],
+      [:name, "/The_Key_of_F#23_Minor"],
+      [:name, "/A#42"],
+      [:name, "/"]
+    ])
+  end
+
+  it 'handles paired braces and strings escapes' do
+    result = tokenize('
+      (Foo \\(with some bars\\))
+      (Foo () bar and (baz))
+      (Foo (with some bars))
+      (((())))
+    ')
+    expect(result).to eq(
+      [[:str, "Foo \\(with some bars\\)"], [:str, "Foo () bar and (baz)"], [:str, "Foo (with some bars)"], [:str, "((()))"]]
+    )
+  end
+
+  it 'detects an unterminated string' do
+    expect {
+      tokenize('(Hello there')
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects an unterminated array' do
+    expect {
+      tokenize('[')
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects an unterminated dictionary' do
+    expect {
+      tokenize('<< /Ohai')
+    }.to raise_error(/did not terminate/)
+  end
+
+  it 'detects a truncated dictionary opener' do
+    expect {
+      tokenize('<</')
+    }.to raise_error(/Dictionary did not terminate/)
+  end
+
+  it 'responds well to fuzzed input' do
+    random = Random.new(12345)
+    1024.times do
+      begin
+        result = tokenize(random.bytes(128))
+        expect(result).to be_kind_of(Array)
+      rescue FormatParser::PDFParser::Tokenizer::Malformed
+        # Everything good, we failed as we should
+      end
+    end
+  end
+end
diff --git a/spec/parsers/pdf_parser_spec.rb b/spec/parsers/pdf_parser_spec.rb
index 2a99f590..33fe477e 100644
--- a/spec/parsers/pdf_parser_spec.rb
+++ b/spec/parsers/pdf_parser_spec.rb
@@ -13,13 +13,14 @@
   shared_examples :behave_like_pdf do |hash|
     let(:pdf_file) { hash.fetch(:file) }
 
-    it 'acts as a pdf' do
+    it 'is recognized as PDF' do
       expect(parsed_pdf).not_to be_nil
       expect(parsed_pdf.nature).to eq(:document)
       expect(parsed_pdf.format).to eq(:pdf)
     end
 
     it 'has a correct page count' do
+      expect(parsed_pdf).not_to be_nil
       expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count))
     end
   end