diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index f562dcbd..9a584b7e 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -1,4 +1,6 @@ class FormatParser::PDFParser + require_relative 'pdf_parser/tokenizer' + require_relative 'pdf_parser/transformer' include FormatParser::IOUtils # First 9 bytes of a PDF should be in this format, according to: @@ -14,62 +16,158 @@ class FormatParser::PDFParser # this. The only way of solving this correctly is by adding # different types of PDF's in the specs. # - COUNT_MARKERS = ['Count '] - EOF_MARKER = '%EOF' + EOF_MARKER = '%EOF' def call(io) io = FormatParser::IOConstraint.new(io) return unless safe_read(io, 9) =~ PDF_MARKER - attributes = scan_for_attributes(io) + io.seek(io.size - 5) + # return unless safe_read(io, 5) == '%%EOF' - FormatParser::Document.new( - format: :pdf, - page_count: attributes[:page_count] - ) - end + xref_offset = locate_xref_table_offset(io) + return unless xref_offset - private + io.seek(xref_offset) + xref_table = parse_xref_table(io) - # Read ahead bytes until one of % or / is reached. - # A header in a PDF always starts with a / - # The % is to detect the EOF - # - def scan_for_attributes(io) - result = {} - - while read = safe_read(io, 1) - case read - when '%' - break if safe_read(io, EOF_MARKER.size) == EOF_MARKER - when '/' - find_page_count(io, result) + xref_table.each do |xref| + # From here on out we need to proceed as follows. We need to buffer (preemptively) + # all the /Type/Pages objects for later. We also need to recover the + # /Type/Catalog object which will refer us to the right /Type /Pages object to use. + # It is a good idea to scan only once, and we also should be "economical" in reading these. + # All the objects we care about start with the object header ("45 0 obj" etc) + # and then must contain an arbitrary amount of whitespace (which we scientifically + # followed by the dictionary open brackets - "<<". 
+ # Then we need to actually go in, read the object and parse the dictionary - luckily + # this is not that much trouble and we can read the entire object, since it is small. + # So let's get at it. + next if xref.length_limit > 1024 # Skip objects which are too large, they aren't what we are looking for anyway + + # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need + # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?) + # but in practice we should be able to get away with just a few things here. + io.seek(xref.offset) + obj_header = io.read(64).to_s + next unless obj_header.include?('/Pages') || obj_header.include?('/Linearized') + + # Seek to that object and read it whole, to the length limit or 1024 bytes whichever is lower + io.seek(xref.offset) + object_buf = io.read(min(1024, xref.length_limit)) + dict = extract_pdf_object_dictionary(object_buf) + if dict['/Type'] == '/Pages' && dict['/Count'] + return FormatParser::Document.new(format: :pdf, page_count: dict['/Count']) + elsif dict['/Linearized'] && dict['/N'] + return FormatParser::Document.new(format: :pdf, page_count: dict['/N']) end end - result + # We could not determine page count + FormatParser::Document.new(format: :pdf) + end + + def locate_xref_table_offset(io) + # Read the "tail" of the PDF and find the 'startxref' declaration + assumed_xref_table_size = 1024 + tail_pos = max(0, io.size - assumed_xref_table_size) + + io.seek(tail_pos) + tail = io.read(assumed_xref_table_size) + + # Find the "startxref" declaration and read the first group of integers after it + start_xref_index = tail.index('startxref') + return unless start_xref_index + + startxref = tail.byteslice(start_xref_index, assumed_xref_table_size)[/\d+/] + return unless startxref + + startxref.to_i end - def find_page_count(io, result) - COUNT_MARKERS.each do |marker| - if safe_read(io, marker.size) == marker - result[:page_count] = read_numbers(io) + XRef = 
Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit) + + def parse_xref_table(io) + xref_table = [] + starting_idx = 0 + num_objects_cross_check = nil + while line = read_until_linebreak(io, char_limit: 32) + case line + when /xref/ + # Starts the cross-reference table + when /^(\d+) (\d+)$/ + # Defines the starting number of the object and the number of objects in the table + starting_idx = $1.to_i + num_objects_cross_check = $2.to_i + when /^(\d{10}) (\d{5}) (\w)$/ + # The actual object offset. Set the length limit to a ridiculous value since we don't know it + xref_table << XRef.new(starting_idx + xref_table.length, $1.to_i, $2.to_i, $3, 99999999) + when /trailer/ + break end end + + # Check if the number of xrefs we got makes sense + if num_objects_cross_check && num_objects_cross_check != xref_table.length + raise "The xref table was declared to contain #{num_objects_cross_check} object refs but contained #{xref_table.length}" + end + + # Reject all disabled objects + xref_table.reject! { |e| e.entry_type == 'f' } + + # Sort sequentially in ascending offset in document order + xref_table.sort_by!(&:offset) + + # Update the limits which will tell us how much we need to read to have the entire object + pairwise(xref_table) do |xref_a, xref_b| + xref_a.length_limit = xref_b.offset - xref_a.offset + end + + xref_table end - # Read ahead bytes until no more numbers are found - # This assumes that the position of io starts at a - # number - def read_numbers(io) - numbers = '' + def pairwise(enum) + pair = [] + enum.each do |e| + pair << e + if pair.length == 2 + yield(pair.first, pair.last) + pair.shift + end + end + end - while c = safe_read(io, 1) - c =~ /\d+/ ? 
numbers << c : break + def read_until_delimiter(io, delimiter:, char_limit: 32) + buf = StringIO.new(''.b) + char_limit.times do + buf << safe_read(io, 1).force_encoding(Encoding::BINARY) + break if buf.string.end_with?(delimiter) || buf.string.bytesize >= char_limit end + buf.string.strip + end + + def read_until_linebreak(io, char_limit: 32) + read_until_delimiter(io, delimiter: "\n", char_limit: char_limit) + end + + def min(*of_items) + of_items.sort.shift + end + + def max(*of_items) + of_items.sort.pop + end - numbers.to_i + def extract_pdf_object_dictionary(str) + token_stream = Tokenizer.new.tokenize(str) + tree = Transformer.new.transform(token_stream) + # Locate the first hash (dictionary) in the parse tree + first_hash = tree.find {|e| e.is_a?(Hash) } + first_hash || {} + rescue => e + $stderr.puts e + # Malformed PDF object or our parser has failed somewhere + {} end FormatParser.register_parser self, natures: :document, formats: :pdf diff --git a/lib/parsers/pdf_parser/tokenizer.rb b/lib/parsers/pdf_parser/tokenizer.rb new file mode 100644 index 00000000..63528735 --- /dev/null +++ b/lib/parsers/pdf_parser/tokenizer.rb @@ -0,0 +1,194 @@ +require 'strscan' + +class FormatParser::PDFParser::Tokenizer + Malformed = Class.new(RuntimeError) + RE = ->(str) { /#{Regexp.escape(str)}/ } + + NAME_RE = begin + # The ASCII subset permissible for PDF name values + printable_ascii = (32..126).to_a + printable_ascii.delete(' '.ord) + printable_ascii.delete('['.ord) + printable_ascii.delete(']'.ord) + printable_ascii.delete('<'.ord) + printable_ascii.delete('>'.ord) + printable_ascii.delete('('.ord) + printable_ascii.delete(')'.ord) + printable_ascii.delete('/'.ord) + printable_ascii.delete('\\'.ord) + exact_char_class = printable_ascii.map(&:chr).join + + /\/[#{exact_char_class}]{0,}/ + end + + STRATEGIES = { + RE['<<'] => :parse_dictionary, + RE['['] => :parse_array, + RE['('] => :parse_string, + /<[0-9a-f]+>/i => :parse_hex_string, + /\d+ \d+ R/ => :parse_ref, + 
NAME_RE => :parse_pdf_name, + + RE['true'] => :wrap_lit, + RE['false'] => :wrap_lit, + RE['null'] => :wrap_lit, + + # 34.5 -3.62 +123.6 4. -.002 0.0 are all valid reals + /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real, + /(\-|\+?)(\d+)\./ => :wrap_real, + /(\-|\+?)\.(\d+)/ => :wrap_real, + /\-?(\d+)/ => :wrap_int, + + RE['obj'] => :wrap_lit, + # Use dirty trick to stop parsing if we encounter anything binary. This does not + # prevent us from reading ahead into the stream, but it does allow us to abort + # quicker + RE['endobj'] => :abort, + RE['stream'] => :abort, + RE['endstream'] => :abort, + + /\s+/ => :wrap_whitespace, + /./ => :garbage, + } + + def wrap_real(pattern) + [:real, @sc.scan(pattern)] + end + + def wrap_int(pattern) + [:int, @sc.scan(pattern)] + end + + def wrap_whitespace(pattern) + @sc.scan(pattern) + [:whitespace, nil] + end + + def wrap_lit(pattern) + [:lit, @sc.scan(pattern).to_sym] + end + + def consume!(pattern, method_name) + at = @sc.pos + return false unless @sc.check(pattern) + debug { "M: #{method_name} @#{at}: 8 chars after scan pointer #{@sc.peek(8).inspect}" } + result = send(method_name, pattern) + @token_stream << result unless result == [:whitespace, nil] + true + end + + def parse_ref(start_pattern) + [:ref, @sc.scan(start_pattern)] + end + + def parse_array(start_pattern) + @sc.scan(start_pattern) # consume [ + dict_open_at = @token_stream.length + walk_scanner(RE[']']) + raise Malformed, 'Array did not terminate' unless @token_stream.pop == :terminator + array_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:array, array_items] + end + + def parse_dictionary(start_pattern) + @sc.scan(start_pattern) # consume << + dict_open_at = @token_stream.length + walk_scanner(RE['>>']) + raise Malformed, 'Dictionary did not terminate' unless @token_stream.pop == :terminator + dict_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:dict, dict_items] + end + + def parse_hex_string(start_pattern) + [:hex_string, 
@sc.scan(start_pattern)] + end + + def parse_string(opening_brace_pattern) + # This is murder. PDF allows paired braces to be put into a string literal + # without any escaping. This means that "(Horrible file format (with a cherry on top))" + # is a valid string. Needs attention. + @sc.scan(opening_brace_pattern) # just the "(" + str = "" + count = 1 + bytes_remaining_to_scan.times do + # Terminate if EOS reached or once we encountered the outermost closing brace + break if @sc.eos? || count == 0 + + byte = @sc.scan(/./) + if byte.nil? + count = 0 # unbalanced parens + elsif byte == 0x5C.chr # "\" -- NOTE(review): /\./ below matches only a literal "."; the escaped char is not consumed here, so an escaped paren still changes count. Confirm whether /./ was intended + str << byte << @sc.scan(/\./).to_s + elsif byte == 0x28.chr # "(" + str << "(" + count += 1 + elsif byte == 0x29.chr # ")" + count -= 1 + str << ")" unless count == 0 + else + str << byte unless count == 0 + end + break if count == 0 + end + raise Malformed, "String did not terminate at #{@sc.pos}" if count > 0 + [:str, str] + end + + def parse_pdf_name(start_pattern) + [:name, @sc.scan(start_pattern)] + end + + def garbage(*) + raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one" + end + + def bytes_remaining_to_scan + @sc.string.bytesize - @sc.pos + end + + def walk_scanner(halt_at_pattern) + # Limit the iterations to AT MOST (!) once per + # remaining byte to parse. This ensures we won't + # have parsing enter an infinite loop where we expect + # the string scanner to have advanced at least a byte forward + # but it would sit on the same offset indefinitely. + bytes_remaining_to_scan.times do + # Terminate if EOS reached + break if @sc.eos? + + # Terminate early + if halt_at_pattern && halted = @sc.scan(halt_at_pattern) + @token_stream << :terminator + return + end + + # Walk through STRATEGIES and stop iterating on first non-false call to consume! 
+ # STRATEGIES are arranged by order of specificity, so for most iterations + # something meaningful should be hit relatively quickly + STRATEGIES.find do |pattern, method_name| + consume!(pattern, method_name) + end + end + end + + # Dirty thing we use to stop parsing as soon as we encounter "endobj", "stream" or "endstream" + def abort(pattern) + str = @sc.scan(pattern) + debug { "X: Aborting tokenization at #{str.inspect} @#{@sc.pos}" } + throw :_abort_ + end + + def tokenize(str, verbose: false) + @verbose = verbose + @sc = StringScanner.new(str.force_encoding(Encoding::BINARY)) + @token_stream = [] + catch :_abort_ do + walk_scanner(_stop_at_pattern = nil) + end + @token_stream + end + + def debug + warn(yield) if @verbose + end +end diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb new file mode 100644 index 00000000..7aab792a --- /dev/null +++ b/lib/parsers/pdf_parser/transformer.rb @@ -0,0 +1,104 @@ +class FormatParser::PDFParser::Transformer + class PDFRef < Struct.new(:object_id, :object_gen) + def self.from_ref_str(str) + id_and_generation_str = str.scan(/(\d+) (\d+) R/).first + new(*id_and_generation_str.map(&:to_i)) + end + end + + # Permitted character escapes. 
There aren't _that_ many so we can use a table + STRING_ESCAPES = { + "\r" => "\n", + "\n\r" => "\n", + "\r\n" => "\n", + '\\n' => "\n", + '\\r' => "\r", + '\\t' => "\t", + '\\b' => "\b", + '\\f' => "\f", + '\\(' => '(', + '\\)' => ')', + '\\\\' => '\\', + "\\\n" => '', + } + + # Octal character escapes that look like \001 etc + 0.upto(9) { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr } + 0.upto(99) { |n| STRING_ESCAPES['\\0' + n.to_s] = ('0' + n.to_s).oct.chr } + 0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s] = n.to_s.oct.chr } + + LITERAL_VALUES = { + :true => true, + :false => false, + :null => nil, + } + + def transform(tokens) + tokens.map {|t| unwrap(*t) } + end + + def unwrap(token_type, token_value) + case token_type + when :dict + unwrap_dict(token_value) + when :array + unwrap_array(token_value) + when :real + unwrap_real(token_value) + when :int + unwrap_int(token_value) + when :ref + unwrap_ref(token_value) + when :name + unwrap_name(token_value) + when :lit + unwrap_lit(token_value) + else + token_value + end + end + + def unwrap_real(value) + value.to_f + end + + def unwrap_int(value) + value.to_i + end + + def unwrap_dict(value) + unwrapped_values = value.map{|e| unwrap(*e) } + keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 } + Hash[keys.zip(values)] + end + + def unwrap_lit(value) + LITERAL_VALUES.fetch(value, value.to_sym) + end + + def unwrap_ref(value) + PDFRef.from_ref_str(value) + end + + def unwrap_array(value) + value.map {|e| unwrap(*e) } + end + + def unwrap_hex_string(str) + str << '0' unless str.bytesize.even? 
+ str.scan(/../).map { |i| i.hex.chr }.join + end + + def unwrap_string(str) + str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + STRING_ESCAPES[match] || '' + end + end + + def unwrap_name(name) + # Replace #0xx hex codes with the corresponding chars + name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code| + $1.to_i(16).chr + end + end +end diff --git a/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj new file mode 100644 index 00000000..2530644b --- /dev/null +++ b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj @@ -0,0 +1,7 @@ +44 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj new file mode 100644 index 00000000..e3ff9300 --- /dev/null +++ b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj @@ -0,0 +1,7 @@ +12 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj new file mode 100644 index 00000000..c2f92e6d --- /dev/null +++ b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj @@ -0,0 +1,9 @@ + 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj new file mode 100644 index 00000000..4fe62abd --- /dev/null +++ b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj @@ -0,0 +1,8 @@ +4 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj new file mode 100644 index 00000000..1cb8e07f --- /dev/null +++ 
b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj @@ -0,0 +1,8 @@ +31 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/example_a.pdfobj b/spec/parsers/pdf_parser/example_a.pdfobj new file mode 100644 index 00000000..6d82200b --- /dev/null +++ b/spec/parsers/pdf_parser/example_a.pdfobj @@ -0,0 +1,9 @@ +[ + << + /Name (Jim) + /Age 39 + /Children [(Heather) (Timothy) (Rebecca)] + >> + 22 + 44.55 +] \ No newline at end of file diff --git a/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj new file mode 100644 index 00000000..24c92285 --- /dev/null +++ b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj @@ -0,0 +1,8 @@ +7 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/tokenizer_spec.rb b/spec/parsers/pdf_parser/tokenizer_spec.rb new file mode 100644 index 00000000..ec45f628 --- /dev/null +++ b/spec/parsers/pdf_parser/tokenizer_spec.rb @@ -0,0 +1,179 @@ +require 'spec_helper' + +describe FormatParser::PDFParser::Tokenizer do + def tokenize(str) + FormatParser::PDFParser::Tokenizer.new.tokenize(str) + end + + def tokenize_file_at(at_path) + FormatParser::PDFParser::Tokenizer.new.tokenize(File.read(at_path)) + end + + describe 'with extracted objects from corpus' do + fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort + fixture_paths.each do |path| + it "scans #{File.basename(path)}" do + result = tokenize_file_at(path) + require 'pp' + pp result + end + end + end + + it 'scans the example object from the PDF presentation' do + result = tokenize_file_at(__dir__ + '/example_a.pdfobj') + expect(result).to eq( + [ + [ + :array, [ + [ + :dict, [ + [:name, '/Name'], 'Jim', + [:name, '/Age'], [:int, 39], + [:name, '/Children'], + [:array, [ + [:str, 'Heather'], + [:str, 'Timothy'], + [:str, 'Rebecca'] + ]] + ] + ], + [:int, 22], + [:real, 44.55] + ] + ] + ] + ) + end + + it 'scans a simple dictionary with strings and 
ints as values' do + result = tokenize('<>') + expect(result).to eq( + [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]] + ) + end + + it 'scans a simple dictionary with arbitrary whitespace' do + result = tokenize('<< + /Name + (Jim) + /Age + 25>>') + expect(result).to eq( + [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]] + ) + end + + it 'parses all kinds of reals' do + result = tokenize('34.5 -3.62 +123.6 4. -.002 0.0') + expect(result).to eq( + [[:real, "34.5"], [:real, "-3.62"], [:real, "+123.6"], [:real, "4."], [:real, "-.002"], [:real, "0.0"]] + ) + end + + it 'parses an array of integers' do + result = tokenize('[1 2 3 4]') + expect(result).to eq( + [[:array, [[:int, "1"], [:int, "2"], [:int, "3"], [:int, "4"]]]] + ) + end + + it 'scans an array of integers with one object ref in the middle' do + result = tokenize('[1 20 00 R 3]') + expect(result).to eq( + [[:array, [[:int, "1"], [:ref, "20 00 R"], [:int, "3"]]]] + ) + end + + it 'scans an array of names' do + result = tokenize('[ /Type /Color /Medium/Rare ]') + expect(result).to eq( + [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] + ) + + result = tokenize('[/Type/Color/Medium/Rare]') + expect(result).to eq( + [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] + ) + end + + it 'handles names' do + names_str = %( + /Name1 + /ASomewhatLongerName /A;Name_With-Various***Characters? 
/1.2 + /$$ + /@pattern + /.notdef + /Adobe#20Green + /PANTONE#205757#20CV + /paired#28#29parentheses + /The_Key_of_F#23_Minor + /A#42 + / + ) + result = tokenize(names_str) + expect(result).to eq([ + [:name, "/Name1"], + [:name, "/ASomewhatLongerName"], + [:name, "/A;Name_With-Various***Characters?"], + [:name, "/1.2"], + [:name, "/$$"], + [:name, "/@pattern"], + [:name, "/.notdef"], + [:name, "/Adobe#20Green"], + [:name, "/PANTONE#205757#20CV"], + [:name, "/paired#28#29parentheses"], + [:name, "/The_Key_of_F#23_Minor"], + [:name, "/A#42"], + [:name, "/"] + ]) + end + + it 'handles paired braces and strings escapes' do + result = tokenize(' + (Foo \\(with some bars\\)) + (Foo () bar and (baz)) + (Foo (with some bars)) + (((()))) + ') + expect(result).to eq( + [[:str, "Foo \\(with some bars\\)"], [:str, "Foo () bar and (baz)"], [:str, "Foo (with some bars)"], [:str, "((()))"]] + ) + end + + it 'detects an unterminated string' do + expect { + tokenize('(Hello there') + }.to raise_error(/did not terminate/) + end + + it 'detects an unterminated array' do + expect { + tokenize('[') + }.to raise_error(/did not terminate/) + end + + it 'detects an unterminated dictionary' do + expect { + tokenize('<< /Ohai') + }.to raise_error(/did not terminate/) + end + + it 'detects a truncated dictionary opener' do + expect { + tokenize('<