From c8114cbb13491d117883fdcf7ee51873d2677e87 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Tue, 12 Jun 2018 20:51:20 +0200 Subject: [PATCH 01/18] Yes. Let's play tough. --- lib/parsers/pdf_parser.rb | 118 ++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 30 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index f562dcbd..b7a7d8a8 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -22,54 +22,112 @@ def call(io) return unless safe_read(io, 9) =~ PDF_MARKER - attributes = scan_for_attributes(io) + io.seek(io.size - 5) +# return unless safe_read(io, 5) == '%%EOF' + xref_offset = locate_xref_table_offset(io) + return unless xref_offset + + io.seek(xref_offset) + xref_table = parse_xref_table(io) + + # return unless xref_table.any? + xref_table.each do |xref| + io.seek(xref.offset) + $stderr.puts io.read(xref.length_limit).inspect + end + + raise "nope" FormatParser::Document.new( format: :pdf, page_count: attributes[:page_count] ) end - private + def locate_xref_table_offset(io) + # Read the "tail" of the PDF and find the 'startxref' declaration + assumed_xref_table_size = 1024 + tail_pos = io.size - assumed_xref_table_size + tail_pos = 0 if tail_pos < 0 + io.seek(tail_pos) + tail = io.read(assumed_xref_table_size) - # Read ahead bytes until one of % or / is reached. - # A header in a PDF always starts with a / - # The % is to detect the EOF - # - def scan_for_attributes(io) - result = {} - - while read = safe_read(io, 1) - case read - when '%' - break if safe_read(io, EOF_MARKER.size) == EOF_MARKER - when '/' - find_page_count(io, result) + # Find the "startxref" declaration and read the first group of integers after it + start_xref_index = tail.index('startxref') + return unless start_xref_index + + startxref = tail.byteslice(start_xref_index, 1024)[/\d+/] + return unless startxref + + startxref.to_i + end + + XRef = Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit) + def parse_xref_table(io) + xref_table = [] + starting_idx = 0 + num_objects_cross_check = nil + while line = read_until_linebreak(io, char_limit: 32) + case line + when /xref/ + # Starts the cross-reference table + when /^(\d+) (\d+)$/ + # Defines the starting number of the object and the number of objects in the table + starting_idx = $1.to_i + num_objects_cross_check = $2.to_i + when /^(\d{10}) (\d{5}) (\w)$/ + # The actual object offset. Set the length limit to a ridiculous value since we don't know it + xref_table << XRef.new(starting_idx + xref_table.length, $1.to_i, $2.to_i, $3, 99999999) + when /trailer/ + break end end - result + # Check if the number of xrefs we got makes sense + if num_objects_cross_check && num_objects_cross_check != xref_table.length + raise "The xref table was declared to contain #{num_objects_cross_check} object refs but contained #{xref_table.length}" + end + + # Reject all disabled objects + xref_table.reject! {|e| e.entry_type == 'f' } + + # Sort sequentially in ascending offset in document order + xref_table.sort_by!(&:offset) + + # Update the limits which will tell us how much we need to read to have the entire object + pairwise(xref_table) do |xref_a, xref_b| + xref_a.length_limit = xref_b.offset - xref_a.offset + end + + xref_table.each do |x| + $stderr.puts x.inspect + end + + xref_table end - def find_page_count(io, result) - COUNT_MARKERS.each do |marker| - if safe_read(io, marker.size) == marker - result[:page_count] = read_numbers(io) + def pairwise(enum) + pair = [] + enum.each do |e| + pair << e + if pair.length == 2 + yield(pair.first, pair.last) + pair.shift end end end - # Read ahead bytes until no more numbers are found - # This assumes that the position of io starts at a - # number - def read_numbers(io) - numbers = '' - - while c = safe_read(io, 1) - c =~ /\d+/ ? numbers << c : break + def read_until_linebreak(io, char_limit: 32) + buf = StringIO.new(''.b) + char_limit.times do + char = safe_read(io, 1).force_encoding(Encoding::BINARY) + if char == "\n" + break + else + buf << char + end end - - numbers.to_i + buf.string.strip end FormatParser.register_parser self, natures: :document, formats: :pdf From f50f18cec92fc47eaf6dd7d949db30d948c5cad8 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Tue, 12 Jun 2018 21:29:30 +0200 Subject: [PATCH 02/18] Yep yep yep --- lib/parsers/pdf_parser.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index b7a7d8a8..00d73aa5 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -34,7 +34,9 @@ def call(io) # return unless xref_table.any? xref_table.each do |xref| io.seek(xref.offset) - $stderr.puts io.read(xref.length_limit).inspect + if xref.length_limit < 128 + $stderr.puts io.read(xref.length_limit).inspect + end end raise "nope" @@ -49,6 +51,7 @@ def locate_xref_table_offset(io) assumed_xref_table_size = 1024 tail_pos = io.size - assumed_xref_table_size tail_pos = 0 if tail_pos < 0 + io.seek(tail_pos) tail = io.read(assumed_xref_table_size) @@ -56,7 +59,7 @@ def locate_xref_table_offset(io) start_xref_index = tail.index('startxref') return unless start_xref_index - startxref = tail.byteslice(start_xref_index, 1024)[/\d+/] + startxref = tail.byteslice(start_xref_index, assumed_xref_table_size)[/\d+/] return unless startxref startxref.to_i @@ -102,7 +105,7 @@ def parse_xref_table(io) xref_table.each do |x| $stderr.puts x.inspect end - + xref_table end From 75f513c9df5d8d5a8b6fab435833340af67f7f19 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Wed, 13 Jun 2018 03:40:22 +0200 Subject: [PATCH 03/18] Yes yes --- ...fe693a8f87297f8c255d899c61b063016a4.pdfobj | 7 + ...4f5f4914c1f71f4916f75894e6a89e780d6.pdfobj | 7 + ...c658d480be83743bd54554dabc4a6681bce.pdfobj | 9 + ...7eb7632f0370bc3463a3884acdf3386ed83.pdfobj | 7 + ...a257840100914b09ce644376375450a0611.pdfobj | 8 + ...9334c1be555436ca56d50f5a12df1c701f6.pdfobj | 8 + spec/parsers/pdf_parser/example_a.pdfobj | 9 + ...a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj | 8 + spec/parsers/pdf_parser/object_parser_spec.rb | 248 ++++++++++++++++++ 9 files changed, 311 insertions(+) create mode 100644 spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj create mode 100644 spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj create mode 100644 spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj create mode 100644 spec/parsers/pdf_parser/82cc57eb7632f0370bc3463a3884acdf3386ed83.pdfobj create mode 100644 spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj create mode 100644 spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj create mode 100644 spec/parsers/pdf_parser/example_a.pdfobj create mode 100644 spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj create mode 100644 spec/parsers/pdf_parser/object_parser_spec.rb diff --git a/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj new file mode 100644 index 00000000..2530644b --- /dev/null +++ b/spec/parsers/pdf_parser/35dadfe693a8f87297f8c255d899c61b063016a4.pdfobj @@ -0,0 +1,7 @@ +44 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj new file mode 100644 index 00000000..e3ff9300 --- /dev/null +++ b/spec/parsers/pdf_parser/3e2044f5f4914c1f71f4916f75894e6a89e780d6.pdfobj @@ -0,0 +1,7 @@ +12 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj new file mode 100644 index 00000000..c2f92e6d --- /dev/null +++ b/spec/parsers/pdf_parser/52a0ac658d480be83743bd54554dabc4a6681bce.pdfobj @@ -0,0 +1,9 @@ + 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj new file mode 100644 index 00000000..4fe62abd --- /dev/null +++ b/spec/parsers/pdf_parser/dc7b1a257840100914b09ce644376375450a0611.pdfobj @@ -0,0 +1,8 @@ +4 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj new file mode 100644 index 00000000..1cb8e07f --- /dev/null +++ b/spec/parsers/pdf_parser/e2e269334c1be555436ca56d50f5a12df1c701f6.pdfobj @@ -0,0 +1,8 @@ +31 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/example_a.pdfobj b/spec/parsers/pdf_parser/example_a.pdfobj new file mode 100644 index 00000000..6d82200b --- /dev/null +++ b/spec/parsers/pdf_parser/example_a.pdfobj @@ -0,0 +1,9 @@ +[ + << + /Name (Jim) + /Age 39 + /Children [(Heather) (Timothy) (Rebecca)] + >> + 22 + 44.55 +] \ No newline at end of file diff --git a/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj new file mode 100644 index 00000000..24c92285 --- /dev/null +++ b/spec/parsers/pdf_parser/f1938a7cc1749ac952c1d5c3fe708f81afc95aa8.pdfobj @@ -0,0 +1,8 @@ +7 0 obj +<> +endobj + diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb new file mode 100644 index 00000000..490d75ad --- /dev/null +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -0,0 +1,248 @@ +require 'spec_helper' + +class NuObjectParser + Malformed = Class.new(RuntimeError) + RE = ->(str) { /#{Regexp.escape(str)}/ } + STRATEGIES = { + RE["/"] => :parse_pdf_name, + RE["<<"] => :parse_dictionary, + RE["["] => :parse_array, + RE["("] => :parse_string, + RE["<"] => :parse_hex_string, + /\d+ \d+ R/ => :parse_ref, + + RE["true"] => :wrap, + RE["false"] => :wrap, + RE["null"] => :wrap, + + /\-?(\d+)\.(\d+)/ => :wrap_real, + /\-?(\d+)/ => :wrap_int, + + RE["obj"] => :wrap, + RE["endobj"] => :wrap, + RE["stream"] => :wrap, + RE["endstream"] => :wrap, +# RE[">>"] => :wrap, +# RE["]"] => :wrap, +# RE[">"] => :wrap, +# RE[")"] => :wrap, + + /\s+/ => :wrap_whitespace, + } + + STRING_ESCAPES = { + "\r" => "\n", + "\n\r" => "\n", + "\r\n" => "\n", + "\\n" => "\n", + "\\r" => "\r", + "\\t" => "\t", + "\\b" => "\b", + "\\f" => "\f", + "\\(" => "(", + "\\)" => ")", + "\\\\" => "\\", + "\\\n" => "", + } + 0.upto(9) { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr } + 0.upto(99) { |n| STRING_ESCAPES["\\0" + n.to_s] = ("0"+n.to_s).oct.chr } + 0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s] = n.to_s.oct.chr } + + def wrap_true(sc, pattern) + @sc.scan(pattern) + true + end + + def wrap_false(pattern) + @sc.scan(pattern) + false + end + + def wrap_nil(pattern) + @sc.scan(pattern) + nil + end + + def wrap_real(pattern) + @sc.scan(pattern).to_f + end + + def wrap_int(pattern) + @sc.scan(pattern).to_i + end + + def wrap_whitespace(pattern) + @sc.scan(pattern) + :whitespace + end + + def wrap(pattern) + data = @sc.scan(pattern) + data.to_sym + end + + def consume!(pattern, method_name) + return unless @sc.check(pattern) + at = @sc.pos + result = send(method_name, pattern) + @token_stream << result unless result == :whitespace + true + end + + def parse_ref(start_pattern) + [:ref, @sc.scan(start_pattern)] + end + + def parse_array(start_pattern) + @sc.scan(start_pattern) # consume [ + dict_open_at = @token_stream.length + walk_scanner(RE["]"]) + raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator + array_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:array, array_items] + end + + def parse_dictionary(start_pattern) + @sc.scan(start_pattern) # consume << + dict_open_at = @token_stream.length + walk_scanner(RE[">>"]) + raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator + dict_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:dict, dict_items] + end + + def parse_string(start_pattern) + rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) + raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string + rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + STRING_ESCAPES[match] || "" + end + end + + def parse_pdf_name(start_pattern) + letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/" + warn("Name parsing needs validation since start pattern is not the same as scan pattern") + [:name, @sc.scan(/\/[#{letters}\d]+/)] + end + + def walk_scanner(halt_at_pattern) + until @sc.eos? + # Terminate early + if halt_at_pattern && halted = @sc.scan(halt_at_pattern) + @token_stream << :terminator + return + end + + # Walk through STRATEGIES and stop iterating on first non-false call to consume! + STRATEGIES.find do |pattern, method_name| + consume!(pattern, method_name) + end + end + end + + def parse(str) + @sc = StringScanner.new(str) + @token_stream = [] + walk_scanner(_stop_at_pattern = nil) + @token_stream + end +end + +describe 'Object parser' do + let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort } + + xit 'scans the extracted object definitions from the corpus' do + fixture_paths.each do |path| + result = NuObjectParser.new.parse(File.read(path)) + end + end + + it 'scans the example object from the PDF presentation' do + obj = File.read(__dir__ + '/example_a.pdfobj') + parser = NuObjectParser.new + result = parser.parse(obj) + expect(result).to eq( + [ + [:array, [ + [:dict, [ + [:name, "/Name"], "Jim", + [:name, "/Age"], 39, + [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]] + ], + 22, + 44.55] + ] + ] + ) + end + + it 'scans a simple dictionary with strings and ints as values' do + result = NuObjectParser.new.parse('<>') + expect(result).to eq( + [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]] + ) + end + + it 'scans a simple dictionary with arbitrary whitespace' do + result = NuObjectParser.new.parse('<< + /Name + (Jim) + /Age + 25>>') + expect(result).to eq( + [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]] + ) + end + + it 'parses an array of integers' do + result = NuObjectParser.new.parse('[1 2 3 4]') + expect(result).to eq( + [[:array, [1, 2, 3, 4]]] + ) + end + + it 'scans an array of integers with one object ref in the middle' do + result = NuObjectParser.new.parse('[1 20 00 R 3]') + expect(result).to eq( + [[:array, [1, [:ref, "20 00 R"], 3]]] + ) + end + + it 'scans an array of names' do + result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]') + expect(result).to eq( + [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium/Rare"]]]] + ) + end + + it 'handles string escapes' do + result = NuObjectParser.new.parse("(Foo \\(with some bars\\))") + expect(result).to eq( + ["Foo (with some bars)"] + ) + end + + it 'detects an unterminated string' do + expect { + NuObjectParser.new.parse("(Hello there") + }.to raise_error(/did not terminate/) + end + + it 'detects an unterminated array' do + expect { + NuObjectParser.new.parse("[") + }.to raise_error(/did not terminate/) + end + + it 'detects an unterminated dictionary' do + expect { + NuObjectParser.new.parse("<< /Ohai") + }.to raise_error(/did not terminate/) + end + + it 'detects a truncated dictionary opener' do + expect { + NuObjectParser.new.parse('< Date: Wed, 13 Jun 2018 11:24:48 +0200 Subject: [PATCH 04/18] Comment more stuff --- lib/parsers/pdf_parser.rb | 41 ++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index 00d73aa5..9510236b 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -14,7 +14,6 @@ class FormatParser::PDFParser # this. The only way of solving this correctly is by adding # different types of PDF's in the specs. # - COUNT_MARKERS = ['Count '] EOF_MARKER = '%EOF' def call(io) @@ -34,8 +33,24 @@ def call(io) # return unless xref_table.any? xref_table.each do |xref| io.seek(xref.offset) - if xref.length_limit < 128 - $stderr.puts io.read(xref.length_limit).inspect + # From here on out we need to proceed as follows. We need to buffer (preemptively) + # all the /Type/Pages objects for later. We also need to recover the + # /Type/Catalog object which will refer us to the right /Type /Pages object to use. + # It is a good idea to scan only once, and we also should be "economical" in reading these. + # All the objects we care about start with the object header ("45 0 obj" etc) + # and then must contain an arbitrary amount of whitespace (which we scientifically + # followed by the dictionary open brackets - "<<". + # Then we need to actually go in, read the object and parse the dictionary - luckily + # this is not that much trouble and we can read the entire object, since it is small. + # So let's get at it. + next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway + + # Do a quickie detection reading just a tiny piece of the object + obj_header = safe_read(io, 32) + if obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog') + io.seek(xref.offset) + object_buf = io.read(xref.length_limit) + parse_object_with_dictionary(object_buf) end end @@ -49,8 +64,7 @@ def call(io) def locate_xref_table_offset(io) # Read the "tail" of the PDF and find the 'startxref' declaration assumed_xref_table_size = 1024 - tail_pos = io.size - assumed_xref_table_size - tail_pos = 0 if tail_pos < 0 + tail_pos = max(0, io.size - assumed_xref_table_size) io.seek(tail_pos) tail = io.read(assumed_xref_table_size) @@ -66,6 +80,7 @@ def locate_xref_table_offset(io) end XRef = Struct.new(:idx, :offset, :generation_number, :entry_type, :length_limit) + def parse_xref_table(io) xref_table = [] starting_idx = 0 @@ -102,10 +117,6 @@ def parse_xref_table(io) xref_a.length_limit = xref_b.offset - xref_a.offset end - xref_table.each do |x| - $stderr.puts x.inspect - end - xref_table end @@ -133,5 +144,17 @@ def read_until_linebreak(io, char_limit: 32) buf.string.strip end + def min(*of_items) + of_items.sort.shift + end + + def max(*of_items) + of_items.sort.pop + end + + def parse_object_with_dictionary(str) + File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') {|f| f << str } + end + FormatParser.register_parser self, natures: :document, formats: :pdf end From eb0670ef3a4cff3bd79b91e49e87dd6f9db2cf6b Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Wed, 13 Jun 2018 11:25:16 +0200 Subject: [PATCH 05/18] Hex string handling --- spec/parsers/pdf_parser/object_parser_spec.rb | 43 ++++++++++++++++--- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb index 490d75ad..678049b9 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -28,6 +28,7 @@ class NuObjectParser # RE[")"] => :wrap, /\s+/ => :wrap_whitespace, + /./ => :garbage, } STRING_ESCAPES = { @@ -111,6 +112,15 @@ def parse_dictionary(start_pattern) [:dict, dict_items] end + def parse_hex_string(start_pattern) + str = @sc.scan(/<[0-9a-f]+>/i) + raise Malformed, "Malformed hex string at #{@sc.pos}" unless str + + str << "0" unless str.size % 2 == 0 + hex_str = str.scan(/../).map {|i| i.hex.chr}.join + [:hex_string, hex_str] + end + def parse_string(start_pattern) rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string @@ -121,12 +131,20 @@ def parse_string(start_pattern) def parse_pdf_name(start_pattern) letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/" - warn("Name parsing needs validation since start pattern is not the same as scan pattern") - [:name, @sc.scan(/\/[#{letters}\d]+/)] + name = @sc.scan(/\/[#{letters}\d]+/) + raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name + [:name, name] end - + + def garbage(*) + raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one" + end + def walk_scanner(halt_at_pattern) - until @sc.eos? + (@sc.string.bytesize - @sc.pos).times do + # Terminate if EOS reached + break if @sc.eos? + # Terminate early if halt_at_pattern && halted = @sc.scan(halt_at_pattern) @token_stream << :terminator @@ -151,7 +169,7 @@ def parse(str) describe 'Object parser' do let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort } - xit 'scans the extracted object definitions from the corpus' do + it 'scans the extracted object definitions from the corpus' do fixture_paths.each do |path| result = NuObjectParser.new.parse(File.read(path)) end @@ -243,6 +261,19 @@ def parse(str) it 'detects a truncated dictionary opener' do expect { NuObjectParser.new.parse('< Date: Thu, 14 Jun 2018 13:48:11 +0200 Subject: [PATCH 06/18] Deal with reals better --- spec/parsers/pdf_parser/nu_object_parser.rb | 152 +++++++++++++ spec/parsers/pdf_parser/object_parser_spec.rb | 213 +++--------------- 2 files changed, 187 insertions(+), 178 deletions(-) create mode 100644 spec/parsers/pdf_parser/nu_object_parser.rb diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb new file mode 100644 index 00000000..300d1a48 --- /dev/null +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -0,0 +1,152 @@ +class NuObjectParser + Malformed = Class.new(RuntimeError) + RE = ->(str) { /#{Regexp.escape(str)}/ } + STRATEGIES = { + RE["/"] => :parse_pdf_name, + RE["<<"] => :parse_dictionary, + RE["["] => :parse_array, + RE["("] => :parse_string, + RE["<"] => :parse_hex_string, + /\d+ \d+ R/ => :parse_ref, + + RE["true"] => :wrap, + RE["false"] => :wrap, + RE["null"] => :wrap, + + # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals + /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real, + /(\-|\+?)(\d+)\./ => :wrap_real, + /(\-|\+?)\.(\d+)/ => :wrap_real, + /\-?(\d+)/ => :wrap_int, + + RE["obj"] => :wrap, + RE["endobj"] => :wrap, + RE["stream"] => :wrap, + RE["endstream"] => :wrap, + + /\s+/ => :wrap_whitespace, + /./ => :garbage, + } + + STRING_ESCAPES = { + "\r" => "\n", + "\n\r" => "\n", + "\r\n" => "\n", + "\\n" => "\n", + "\\r" => "\r", + "\\t" => "\t", + "\\b" => "\b", + "\\f" => "\f", + "\\(" => "(", + "\\)" => ")", + "\\\\" => "\\", + "\\\n" => "", + } + 0.upto(9) { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr } + 0.upto(99) { |n| STRING_ESCAPES["\\0" + n.to_s] = ("0"+n.to_s).oct.chr } + 0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s] = n.to_s.oct.chr } + + def wrap_real(pattern) + [:real, @sc.scan(pattern).to_f] + end + + def wrap_int(pattern) + [:int, @sc.scan(pattern).to_i] + end + + def wrap_whitespace(pattern) + @sc.scan(pattern) + [:whitespace, nil] + end + + def wrap(pattern) + [:lit, @sc.scan(pattern).to_sym] + end + + def consume!(pattern, method_name) + at = @sc.pos + unless @sc.check(pattern) + $stderr.puts " : #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..." + return false + end + $stderr.puts "M: #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..." + result = send(method_name, pattern) + @token_stream << result unless result == [:whitespace, nil] + true + end + + def parse_ref(start_pattern) + [:ref, @sc.scan(start_pattern)] + end + + def parse_array(start_pattern) + @sc.scan(start_pattern) # consume [ + dict_open_at = @token_stream.length + walk_scanner(RE["]"]) + raise Malformed, "Array did not terminate" unless @token_stream.pop == :terminator + array_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:array, array_items] + end + + def parse_dictionary(start_pattern) + @sc.scan(start_pattern) # consume << + dict_open_at = @token_stream.length + walk_scanner(RE[">>"]) + raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator + dict_items = @token_stream.pop(@token_stream.length - dict_open_at) + [:dict, dict_items] + end + + def parse_hex_string(start_pattern) + str = @sc.scan(/<[0-9a-f]+>/i) + raise Malformed, "Malformed hex string at #{@sc.pos}" unless str + + str << "0" unless str.bytesize % 2 == 0 + hex_str = str.scan(/../).map {|i| i.hex.chr}.join + [:hex_string, hex_str] + end + + def parse_string(start_pattern) + rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) + raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string + rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + STRING_ESCAPES[match] || "" + end + end + + def parse_pdf_name(start_pattern) + letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + name = @sc.scan(/\/[#{letters}\d]+/) + raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name + [:name, name] + end + + def garbage(*) + raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one" + end + + def walk_scanner(halt_at_pattern) + (@sc.string.bytesize - @sc.pos).times do + # Terminate if EOS reached + break if @sc.eos? + + # Terminate early + if halt_at_pattern && halted = @sc.scan(halt_at_pattern) + @token_stream << :terminator + return + end + + # Walk through STRATEGIES and stop iterating on first non-false call to consume! + STRATEGIES.find do |pattern, method_name| + consume!(pattern, method_name) + end + end + end + + def parse(str) + @sc = StringScanner.new(str) + @token_stream = [] + walk_scanner(_stop_at_pattern = nil) + @token_stream + end +end diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb index 678049b9..6ab6fe87 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -1,177 +1,15 @@ require 'spec_helper' - -class NuObjectParser - Malformed = Class.new(RuntimeError) - RE = ->(str) { /#{Regexp.escape(str)}/ } - STRATEGIES = { - RE["/"] => :parse_pdf_name, - RE["<<"] => :parse_dictionary, - RE["["] => :parse_array, - RE["("] => :parse_string, - RE["<"] => :parse_hex_string, - /\d+ \d+ R/ => :parse_ref, - - RE["true"] => :wrap, - RE["false"] => :wrap, - RE["null"] => :wrap, - - /\-?(\d+)\.(\d+)/ => :wrap_real, - /\-?(\d+)/ => :wrap_int, - - RE["obj"] => :wrap, - RE["endobj"] => :wrap, - RE["stream"] => :wrap, - RE["endstream"] => :wrap, -# RE[">>"] => :wrap, -# RE["]"] => :wrap, -# RE[">"] => :wrap, -# RE[")"] => :wrap, - - /\s+/ => :wrap_whitespace, - /./ => :garbage, - } - - STRING_ESCAPES = { - "\r" => "\n", - "\n\r" => "\n", - "\r\n" => "\n", - "\\n" => "\n", - "\\r" => "\r", - "\\t" => "\t", - "\\b" => "\b", - "\\f" => "\f", - "\\(" => "(", - "\\)" => ")", - "\\\\" => "\\", - "\\\n" => "", - } - 0.upto(9) { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr } - 0.upto(99) { |n| STRING_ESCAPES["\\0" + n.to_s] = ("0"+n.to_s).oct.chr } - 0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s] = n.to_s.oct.chr } - - def wrap_true(sc, pattern) - @sc.scan(pattern) - true - end - - def wrap_false(pattern) - @sc.scan(pattern) - false - end - - def wrap_nil(pattern) - @sc.scan(pattern) - nil - end - - def wrap_real(pattern) - @sc.scan(pattern).to_f - end - - def wrap_int(pattern) - @sc.scan(pattern).to_i - end - - def wrap_whitespace(pattern) - @sc.scan(pattern) - :whitespace - end - - def wrap(pattern) - data = @sc.scan(pattern) - data.to_sym - end - - def consume!(pattern, method_name) - return unless @sc.check(pattern) - at = @sc.pos - result = send(method_name, pattern) - @token_stream << result unless result == :whitespace - true - end - - def parse_ref(start_pattern) - [:ref, @sc.scan(start_pattern)] - end - - def parse_array(start_pattern) - @sc.scan(start_pattern) # consume [ - dict_open_at = @token_stream.length - walk_scanner(RE["]"]) - raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator - array_items = @token_stream.pop(@token_stream.length - dict_open_at) - [:array, array_items] - end - - def parse_dictionary(start_pattern) - @sc.scan(start_pattern) # consume << - dict_open_at = @token_stream.length - walk_scanner(RE[">>"]) - raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator - dict_items = @token_stream.pop(@token_stream.length - dict_open_at) - [:dict, dict_items] - end - - def parse_hex_string(start_pattern) - str = @sc.scan(/<[0-9a-f]+>/i) - raise Malformed, "Malformed hex string at #{@sc.pos}" unless str - - str << "0" unless str.size % 2 == 0 - hex_str = str.scan(/../).map {|i| i.hex.chr}.join - [:hex_string, hex_str] - end - - def parse_string(start_pattern) - rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) - raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string - rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| - STRING_ESCAPES[match] || "" - end - end - - def parse_pdf_name(start_pattern) - letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join + "/" - name = @sc.scan(/\/[#{letters}\d]+/) - raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name - [:name, name] - end - - def garbage(*) - raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one" - end - - def walk_scanner(halt_at_pattern) - (@sc.string.bytesize - @sc.pos).times do - # Terminate if EOS reached - break if @sc.eos? - - # Terminate early - if halt_at_pattern && halted = @sc.scan(halt_at_pattern) - @token_stream << :terminator - return - end - - # Walk through STRATEGIES and stop iterating on first non-false call to consume! - STRATEGIES.find do |pattern, method_name| - consume!(pattern, method_name) - end - end - end - - def parse(str) - @sc = StringScanner.new(str) - @token_stream = [] - walk_scanner(_stop_at_pattern = nil) - @token_stream - end -end +require_relative 'nu_object_parser' describe 'Object parser' do - let(:fixture_paths) { Dir.glob(__dir__ + '/*.pdfobj').sort } - - it 'scans the extracted object definitions from the corpus' do + describe 'with extracted objects from corpus' do + fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort fixture_paths.each do |path| - result = NuObjectParser.new.parse(File.read(path)) + it "scans #{File.basename(path)}" do + result = NuObjectParser.new.parse(File.read(path)) + require 'pp' + pp result + end end end @@ -184,11 +22,11 @@ def parse(str) [:array, [ [:dict, [ [:name, "/Name"], "Jim", - [:name, "/Age"], 39, + [:name, "/Age"], [:int, 39], [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]] ], - 22, - 44.55] + [:int, 22], + [:real, 44.55]] ] ] ) @@ -197,7 +35,7 @@ def parse(str) it 'scans a simple dictionary with strings and ints as values' do result = NuObjectParser.new.parse('<>') expect(result).to eq( - [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]] + [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]] ) end @@ -208,28 +46,40 @@ def parse(str) /Age 25>>') expect(result).to eq( - [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], 25]]] + [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]] + ) + end + + it 'parses all kinds of reals' do + result = NuObjectParser.new.parse('34.5 -3.62 +123.6 4. -.002 0.0') + expect(result).to eq( + [[:real, 34.5], [:real, -3.62], [:real, 123.6], [:real, 4.0], [:real, -0.002], [:real, 0.0]] ) end it 'parses an array of integers' do result = NuObjectParser.new.parse('[1 2 3 4]') expect(result).to eq( - [[:array, [1, 2, 3, 4]]] + [[:array, [[:int, 1], [:int, 2], [:int, 3], [:int, 4]]]] ) end it 'scans an array of integers with one object ref in the middle' do result = NuObjectParser.new.parse('[1 20 00 R 3]') expect(result).to eq( - [[:array, [1, [:ref, "20 00 R"], 3]]] + [[:array, [[:int, 1], [:ref, "20 00 R"], [:int, 3]]]] ) end it 'scans an array of names' do result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]') expect(result).to eq( - [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium/Rare"]]]] + [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]] + ) + + result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]') + expect(result).to eq( + [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]] ) end @@ -240,6 +90,13 @@ def parse(str) ) end + it 'handles paired braces in strings escapes' do + result = NuObjectParser.new.parse("(Foo () bar and (baz))") + expect(result).to eq( + ["Foo (with some bars)"] + ) + end + it 'detects an unterminated string' do expect { NuObjectParser.new.parse("(Hello there") From 1d451359f7d556c4b07e40a6ae77140fb0847774 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Thu, 14 Jun 2018 14:21:07 +0200 Subject: [PATCH 07/18] Improve debug prints a little --- spec/parsers/pdf_parser/nu_object_parser.rb | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index 300d1a48..98b087df 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -65,11 +65,8 @@ def wrap(pattern) def consume!(pattern, method_name) at = @sc.pos - unless @sc.check(pattern) - $stderr.puts " : #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..." - return false - end - $stderr.puts "M: #{pattern} -> #{method_name} @#{at}: will scan #{@sc.peek(8).inspect}..." + return false unless @sc.check(pattern) + debug { "M: #{method_name} @#{at}: 8 chars after scan pointer #{@sc.peek(8).inspect}" } result = send(method_name, pattern) @token_stream << result unless result == [:whitespace, nil] true @@ -149,4 +146,8 @@ def parse(str) walk_scanner(_stop_at_pattern = nil) @token_stream end + + def debug + $stderr.puts(yield) + end end From c02f5bc617fb60768bbb3dab14384915b2401c94 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Thu, 14 Jun 2018 14:27:41 +0200 Subject: [PATCH 08/18] Explain the loop limiter --- spec/parsers/pdf_parser/nu_object_parser.rb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index 98b087df..ed0d2245 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -123,6 +123,11 @@ def garbage(*) end def walk_scanner(halt_at_pattern) + # Limit the iterations to AT MOST (!) once per + # remaining byte to parse. This ensures we won't + # have parsing enter an infinite loop where we expect + # the string scanner to have advanced at least a byte forward + # but it would sit on the same offset indifinitely. (@sc.string.bytesize - @sc.pos).times do # Terminate if EOS reached break if @sc.eos? @@ -134,6 +139,8 @@ def walk_scanner(halt_at_pattern) end # Walk through STRATEGIES and stop iterating on first non-false call to consume! + # STRATEGIES are arranged by order of specificity, so for most iterations + # somethign meaningful should be hit relatively quickly STRATEGIES.find do |pattern, method_name| consume!(pattern, method_name) end From 56548a6aa2c6933a1ad43c4723fdfa33d9c576b4 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Thu, 14 Jun 2018 15:37:38 +0200 Subject: [PATCH 09/18] Improve handling of names --- spec/parsers/pdf_parser/nu_object_parser.rb | 36 ++++++++++++++++--- spec/parsers/pdf_parser/object_parser_spec.rb | 32 +++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index ed0d2245..e8c01915 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -1,13 +1,31 @@ class NuObjectParser Malformed = Class.new(RuntimeError) RE = ->(str) { /#{Regexp.escape(str)}/ } + + NAME_RE = begin + # The ASCII subset permissible for PDF name values + printable_ascii = (32..126).to_a + printable_ascii.delete(' '.ord) + printable_ascii.delete('['.ord) + printable_ascii.delete(']'.ord) + printable_ascii.delete('<'.ord) + printable_ascii.delete('>'.ord) + printable_ascii.delete('('.ord) + printable_ascii.delete(')'.ord) + printable_ascii.delete('/'.ord) + printable_ascii.delete('\\'.ord) + exact_char_class = printable_ascii.map(&:chr).join + + /\/[#{exact_char_class}]{0,}/ + end + STRATEGIES = { - RE["/"] => :parse_pdf_name, RE["<<"] => :parse_dictionary, RE["["] => :parse_array, RE["("] => :parse_string, RE["<"] => :parse_hex_string, /\d+ \d+ R/ => :parse_ref, + NAME_RE => :parse_pdf_name, RE["true"] => :wrap, RE["false"] => :wrap, @@ -28,6 +46,7 @@ class NuObjectParser /./ => :garbage, } + # Permitted character escapes. There aren't _that_ many so we can use a table STRING_ESCAPES = { "\r" => "\n", "\n\r" => "\n", @@ -42,6 +61,8 @@ class NuObjectParser "\\\\" => "\\", "\\\n" => "", } + + # Octal character escapes that look like \001 etc 0.upto(9) { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr } 0.upto(99) { |n| STRING_ESCAPES["\\0" + n.to_s] = ("0"+n.to_s).oct.chr } 0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s] = n.to_s.oct.chr } @@ -104,6 +125,9 @@ def parse_hex_string(start_pattern) end def parse_string(start_pattern) + # This is murder. PDF allows paired braces to be put into a string literal + # without any escaping. This means that "(Horrible file format (with a cherry on top))" + # is a valid string. Needs attention. rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| @@ -112,10 +136,12 @@ def parse_string(start_pattern) end def parse_pdf_name(start_pattern) - letters = ('a'..'z').to_a.join + ('A'..'Z').to_a.join - name = @sc.scan(/\/[#{letters}\d]+/) - raise Malformed, "Expected a well-formed PDF name at #{@sc.pos} but could not recover any" unless name - [:name, name] + name = @sc.scan(start_pattern) + # Replace #023 hex codes with the corresponding chars + name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |hex_code| + $1.to_i(16).chr + end + [:name, name_sans_escapes] end def garbage(*) diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb index 6ab6fe87..9ba84494 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -83,6 +83,38 @@ ) end + it 'handles names' do + names_str = %( + /Name1 + /ASomewhatLongerName /A;Name_With-Various***Characters? /1.2 + /$$ + /@pattern + /.notdef + /Adobe#20Green + /PANTONE#205757#20CV + /paired#28#29parentheses + /The_Key_of_F#23_Minor + /A#42 + / + ) + result = NuObjectParser.new.parse(names_str) + expect(result).to eq([ + [:name, "/Name1"], + [:name, "/ASomewhatLongerName"], + [:name, "/A;Name_With-Various***Characters?"], + [:name, "/1.2"], + [:name, "/$$"], + [:name, "/@pattern"], + [:name, "/.notdef"], + [:name, "/Adobe Green"], + [:name, "/PANTONE 5757 CV"], + [:name, "/paired()parentheses"], + [:name, "/The_Key_of_F#_Minor"], + [:name, "/AB"], + [:name, "/"] + ]) + end + it 'handles string escapes' do result = NuObjectParser.new.parse("(Foo \\(with some bars\\))") expect(result).to eq( From 84e2c04eb795e3a4b062f5dc7eb40b08bd740850 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Thu, 14 Jun 2018 16:30:59 +0200 Subject: [PATCH 10/18] Let's do some rubocop here --- spec/parsers/pdf_parser/nu_object_parser.rb | 72 +++++++++---------- spec/parsers/pdf_parser/object_parser_spec.rb | 71 +++++++++--------- 2 files changed, 73 insertions(+), 70 deletions(-) diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index e8c01915..ffcad3ba 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -15,21 +15,21 @@ class NuObjectParser printable_ascii.delete('/'.ord) printable_ascii.delete('\\'.ord) exact_char_class = printable_ascii.map(&:chr).join - + /\/[#{exact_char_class}]{0,}/ end STRATEGIES = { - RE["<<"] => :parse_dictionary, - RE["["] => :parse_array, - RE["("] => :parse_string, - RE["<"] => :parse_hex_string, + RE['<<'] => :parse_dictionary, + RE['['] => :parse_array, + RE['('] => :parse_string, + RE['<'] => :parse_hex_string, /\d+ \d+ R/ => :parse_ref, - NAME_RE => :parse_pdf_name, + NAME_RE => :parse_pdf_name, - RE["true"] => :wrap, - RE["false"] => :wrap, - RE["null"] => :wrap, + RE['true'] => :wrap, + RE['false'] => :wrap, + RE['null'] => :wrap, # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real, @@ -37,10 +37,10 @@ class NuObjectParser /(\-|\+?)\.(\d+)/ => :wrap_real, /\-?(\d+)/ => :wrap_int, - RE["obj"] => :wrap, - RE["endobj"] => :wrap, - RE["stream"] => :wrap, - RE["endstream"] => :wrap, + RE['obj'] => :wrap, + RE['endobj'] => :wrap, + RE['stream'] => :wrap, + RE['endstream'] => :wrap, /\s+/ => :wrap_whitespace, /./ => :garbage, @@ -51,21 +51,21 @@ class NuObjectParser "\r" => "\n", "\n\r" => "\n", "\r\n" => "\n", - "\\n" => "\n", - "\\r" => "\r", - "\\t" => "\t", - "\\b" => "\b", - "\\f" => "\f", - "\\(" => "(", - "\\)" => ")", - "\\\\" => "\\", - "\\\n" => "", + '\\n' => "\n", + '\\r' => "\r", + '\\t' => "\t", + '\\b' => "\b", + '\\f' => "\f", + '\\(' => '(', + '\\)' => ')', + '\\\\' => '\\', + "\\\n" => '', } # Octal character escapes that look like \001 etc - 0.upto(9) { |n| STRING_ESCAPES["\\00" + n.to_s] = ("00"+n.to_s).oct.chr } - 0.upto(99) { |n| STRING_ESCAPES["\\0" + n.to_s] = ("0"+n.to_s).oct.chr } - 0.upto(377) { |n| STRING_ESCAPES["\\" + n.to_s] = n.to_s.oct.chr } + 0.upto(9) { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr } + 0.upto(99) { |n| STRING_ESCAPES['\\0' + n.to_s] = ('0' + n.to_s).oct.chr } + 0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s] = n.to_s.oct.chr } def wrap_real(pattern) [:real, @sc.scan(pattern).to_f] @@ -100,8 +100,8 @@ def parse_ref(start_pattern) def parse_array(start_pattern) @sc.scan(start_pattern) # consume [ dict_open_at = @token_stream.length - walk_scanner(RE["]"]) - raise Malformed, "Array did not terminate" unless @token_stream.pop == :terminator + walk_scanner(RE[']']) + raise Malformed, 'Array did not terminate' unless @token_stream.pop == :terminator array_items = @token_stream.pop(@token_stream.length - dict_open_at) [:array, array_items] end @@ -109,36 +109,36 @@ def parse_array(start_pattern) def parse_dictionary(start_pattern) @sc.scan(start_pattern) # consume << dict_open_at = @token_stream.length - walk_scanner(RE[">>"]) - raise Malformed, "Dictionary did not terminate" unless @token_stream.pop == :terminator + walk_scanner(RE['>>']) + raise Malformed, 'Dictionary did not terminate' unless @token_stream.pop == :terminator dict_items = @token_stream.pop(@token_stream.length - dict_open_at) [:dict, dict_items] end - def parse_hex_string(start_pattern) + def parse_hex_string(_start_pattern) str = @sc.scan(/<[0-9a-f]+>/i) raise Malformed, "Malformed hex string at #{@sc.pos}" unless str - str << "0" unless str.bytesize % 2 == 0 - hex_str = str.scan(/../).map {|i| i.hex.chr}.join + str << '0' unless str.bytesize.even? + hex_str = str.scan(/../).map { |i| i.hex.chr }.join [:hex_string, hex_str] end - def parse_string(start_pattern) + def parse_string(_start_pattern) # This is murder. PDF allows paired braces to be put into a string literal # without any escaping. This means that "(Horrible file format (with a cherry on top))" # is a valid string. Needs attention. rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| - STRING_ESCAPES[match] || "" + STRING_ESCAPES[match] || '' end end def parse_pdf_name(start_pattern) name = @sc.scan(start_pattern) # Replace #023 hex codes with the corresponding chars - name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |hex_code| + name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code| $1.to_i(16).chr end [:name, name_sans_escapes] @@ -181,6 +181,6 @@ def parse(str) end def debug - $stderr.puts(yield) + warn(yield) end end diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb index 9ba84494..aad2997a 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -19,14 +19,18 @@ result = parser.parse(obj) expect(result).to eq( [ - [:array, [ - [:dict, [ - [:name, "/Name"], "Jim", - [:name, "/Age"], [:int, 39], - [:name, "/Children"], [:array, ["Heather", "Timothy", "Rebecca"]]] - ], - [:int, 22], - [:real, 44.55]] + [ + :array, [ + [ + :dict, [ + [:name, '/Name'], 'Jim', + [:name, '/Age'], [:int, 39], + [:name, '/Children'], [:array, ['Heather', 'Timothy', 'Rebecca']] + ] + ], + [:int, 22], + [:real, 44.55] + ] ] ] ) @@ -35,7 +39,7 @@ it 'scans a simple dictionary with strings and ints as values' do result = NuObjectParser.new.parse('<>') expect(result).to eq( - [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]] + [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]] ) end @@ -46,7 +50,7 @@ /Age 25>>') expect(result).to eq( - [[:dict, [[:name, "/Name"], "Jim", [:name, "/Age"], [:int, 25]]]] + [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]] ) end @@ -67,19 +71,19 @@ it 'scans an array of integers with one object ref in the middle' do result = NuObjectParser.new.parse('[1 20 00 R 3]') expect(result).to eq( - [[:array, [[:int, 1], [:ref, "20 00 R"], [:int, 3]]]] + [[:array, [[:int, 1], [:ref, '20 00 R'], [:int, 3]]]] ) end it 'scans an array of names' do result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]') expect(result).to eq( - [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]] + [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] ) result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]') expect(result).to eq( - [[:array, [[:name, "/Type"], [:name, "/Color"], [:name, "/Medium"], [:name, "/Rare"]]]] + [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] ) end @@ -99,51 +103,51 @@ ) result = NuObjectParser.new.parse(names_str) expect(result).to eq([ - [:name, "/Name1"], - [:name, "/ASomewhatLongerName"], - [:name, "/A;Name_With-Various***Characters?"], - [:name, "/1.2"], - [:name, "/$$"], - [:name, "/@pattern"], - [:name, "/.notdef"], - [:name, "/Adobe Green"], - [:name, "/PANTONE 5757 CV"], - [:name, "/paired()parentheses"], - [:name, "/The_Key_of_F#_Minor"], - [:name, "/AB"], - [:name, "/"] + [:name, '/Name1'], + [:name, '/ASomewhatLongerName'], + [:name, '/A;Name_With-Various***Characters?'], + [:name, '/1.2'], + [:name, '/$$'], + [:name, '/@pattern'], + [:name, '/.notdef'], + [:name, '/Adobe Green'], + [:name, '/PANTONE 5757 CV'], + [:name, '/paired()parentheses'], + [:name, '/The_Key_of_F#_Minor'], + [:name, '/AB'], + [:name, '/'] ]) end it 'handles string escapes' do - result = NuObjectParser.new.parse("(Foo \\(with some bars\\))") + result = NuObjectParser.new.parse('(Foo \\(with some bars\\))') expect(result).to eq( - ["Foo (with some bars)"] + ['Foo (with some bars)'] ) end it 'handles paired braces in strings escapes' do - result = NuObjectParser.new.parse("(Foo () bar and (baz))") + result = NuObjectParser.new.parse('(Foo () bar and (baz))') expect(result).to eq( - ["Foo (with some bars)"] + ['Foo (with some bars)'] ) end it 'detects an unterminated string' do expect { - NuObjectParser.new.parse("(Hello there") + NuObjectParser.new.parse('(Hello there') }.to raise_error(/did not terminate/) end it 'detects an unterminated array' do expect { - NuObjectParser.new.parse("[") + NuObjectParser.new.parse('[') }.to raise_error(/did not terminate/) end it 'detects an unterminated dictionary' do expect { - NuObjectParser.new.parse("<< /Ohai") + NuObjectParser.new.parse('<< /Ohai') }.to raise_error(/did not terminate/) end @@ -164,5 +168,4 @@ end end end - end From 91af38361ea68420e7366cc8066a863e608e0d39 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Thu, 14 Jun 2018 16:36:12 +0200 Subject: [PATCH 11/18] Meh --- spec/parsers/pdf_parser/nu_object_parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index ffcad3ba..55d814a0 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -130,7 +130,7 @@ def parse_string(_start_pattern) # is a valid string. Needs attention. rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string - rest_of_string[1..-2].gsub (/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + rest_of_string[1..-2].gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| STRING_ESCAPES[match] || '' end end From b268fbebd4e2773967dc5b67c5e376a76e7b5a2a Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Fri, 15 Jun 2018 11:53:54 +0200 Subject: [PATCH 12/18] Getting there --- lib/parsers/pdf_parser.rb | 21 +++-- spec/parsers/pdf_parser/nu_object_parser.rb | 78 ++++++++++++++++--- spec/parsers/pdf_parser/object_parser_spec.rb | 30 ++++--- 3 files changed, 97 insertions(+), 32 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index 9510236b..0ff515ca 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -14,7 +14,7 @@ class FormatParser::PDFParser # this. The only way of solving this correctly is by adding # different types of PDF's in the specs. # - EOF_MARKER = '%EOF' + EOF_MARKER = '%EOF' def call(io) io = FormatParser::IOConstraint.new(io) @@ -22,7 +22,7 @@ def call(io) return unless safe_read(io, 9) =~ PDF_MARKER io.seek(io.size - 5) -# return unless safe_read(io, 5) == '%%EOF' + # return unless safe_read(io, 5) == '%%EOF' xref_offset = locate_xref_table_offset(io) return unless xref_offset @@ -43,18 +43,17 @@ def call(io) # Then we need to actually go in, read the object and parse the dictionary - luckily # this is not that much trouble and we can read the entire object, since it is small. # So let's get at it. - next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway + next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway # Do a quickie detection reading just a tiny piece of the object obj_header = safe_read(io, 32) - if obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog') - io.seek(xref.offset) - object_buf = io.read(xref.length_limit) - parse_object_with_dictionary(object_buf) - end + next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog') + io.seek(xref.offset) + object_buf = io.read(xref.length_limit) + parse_object_with_dictionary(object_buf) end - raise "nope" + raise 'nope' FormatParser::Document.new( format: :pdf, page_count: attributes[:page_count] @@ -107,7 +106,7 @@ def parse_xref_table(io) end # Reject all disabled objects - xref_table.reject! {|e| e.entry_type == 'f' } + xref_table.reject! { |e| e.entry_type == 'f' } # Sort sequentially in ascending offset in document order xref_table.sort_by!(&:offset) @@ -153,7 +152,7 @@ def max(*of_items) end def parse_object_with_dictionary(str) - File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') {|f| f << str } + File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') { |f| f << str } end FormatParser.register_parser self, natures: :document, formats: :pdf diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/spec/parsers/pdf_parser/nu_object_parser.rb index 55d814a0..659be531 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/spec/parsers/pdf_parser/nu_object_parser.rb @@ -27,9 +27,9 @@ class NuObjectParser /\d+ \d+ R/ => :parse_ref, NAME_RE => :parse_pdf_name, - RE['true'] => :wrap, - RE['false'] => :wrap, - RE['null'] => :wrap, + RE['true'] => :wrap_lit, + RE['false'] => :wrap_lit, + RE['null'] => :wrap_lit, # 34.5 −3.62 +123.6 4. −.002 0.0 are all valid reals /(\-|\+?)(\d+)\.(\d+)/ => :wrap_real, @@ -80,7 +80,7 @@ def wrap_whitespace(pattern) [:whitespace, nil] end - def wrap(pattern) + def wrap_lit(pattern) [:lit, @sc.scan(pattern).to_sym] end @@ -124,15 +124,36 @@ def parse_hex_string(_start_pattern) [:hex_string, hex_str] end - def parse_string(_start_pattern) + def parse_string(opening_brace_pattern) # This is murder. PDF allows paired braces to be put into a string literal # without any escaping. This means that "(Horrible file format (with a cherry on top))" # is a valid string. Needs attention. - rest_of_string = @sc.scan_until(/[^\\]\)/) # consume everything starting with ( and upto a non-escaped ) - raise Malformed, "String did not terminate (started at at #{@sc.pos})" unless rest_of_string - rest_of_string[1..-2].gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + @sc.scan(opening_brace_pattern) # just the "(" + str = "" + count = 1 + bytes_remaining_to_scan.times do + break if @sc.eos? || count == 0 + + byte = @sc.scan(/./) + if byte.nil? + count = 0 # unbalanced parens + elsif byte == 0x5C.chr # "\" + str << byte << @sc.scan(/\./).to_s + elsif byte == 0x28.chr # "(" + str << "(" + count += 1 + elsif byte == 0x29.chr # ")" + count -= 1 + str << ")" unless count == 0 + else + str << byte unless count == 0 + end + break if count == 0 + end + unescaped = str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| STRING_ESCAPES[match] || '' end + [:str, unescaped] end def parse_pdf_name(start_pattern) @@ -148,13 +169,17 @@ def garbage(*) raise Malformed, "Expected a meaningful token at #{@sc.pos} but did not encounter one" end + def bytes_remaining_to_scan + @sc.string.bytesize - @sc.pos + end + def walk_scanner(halt_at_pattern) # Limit the iterations to AT MOST (!) once per # remaining byte to parse. This ensures we won't # have parsing enter an infinite loop where we expect # the string scanner to have advanced at least a byte forward # but it would sit on the same offset indifinitely. - (@sc.string.bytesize - @sc.pos).times do + bytes_remaining_to_scan.times do # Terminate if EOS reached break if @sc.eos? @@ -173,13 +198,46 @@ def walk_scanner(halt_at_pattern) end end - def parse(str) + def tokenize(str) @sc = StringScanner.new(str) @token_stream = [] walk_scanner(_stop_at_pattern = nil) @token_stream end + class PDFRef < Struct.new(:object_id, :object_gen) + def initialize(str) + super(*str.scan(/(\d+) (\d+) R/).first) + end + end + + class PDFName < Struct.new(:name) + end + + def parse(str) + ast = tokenize(str) + unwrap_token = ->(token) { + if token.length == 2 && token.first.is_a?(Symbol) + token_type, token_value = token + case token_type + when :dict + unwrapped_values = token_value.map(&unwrap_token) + keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 } + Hash[keys.zip(values)] + when :array + token_value.map(&unwrap_token) + when :name + PDFName.new(token_value) + when :lit + {:true => true, :false => false, :null => nil}.fetch(token_value) + end + else + token + end + } + unwrap_token.(ast) + end + def debug warn(yield) end diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/object_parser_spec.rb index aad2997a..a97a0c2d 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/object_parser_spec.rb @@ -25,7 +25,12 @@ :dict, [ [:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 39], - [:name, '/Children'], [:array, ['Heather', 'Timothy', 'Rebecca']] + [:name, '/Children'], + [:array, [ + [:str, 'Heather'], + [:str, 'Timothy'], + [:str, 'Rebecca'] + ]] ] ], [:int, 22], @@ -119,17 +124,20 @@ ]) end - it 'handles string escapes' do - result = NuObjectParser.new.parse('(Foo \\(with some bars\\))') + it 'handles paired braces and strings escapes' do + result = NuObjectParser.new.parse(' + (Foo \\(with some bars\\)) + (Foo () bar and (baz)) + (Foo (with some bars)) + (((()))) + ') expect(result).to eq( - ['Foo (with some bars)'] - ) - end - - it 'handles paired braces in strings escapes' do - result = NuObjectParser.new.parse('(Foo () bar and (baz))') - expect(result).to eq( - ['Foo (with some bars)'] + [ + [:str, "Foo (with some bars)"], + [:str, "Foo () bar and (baz)"], + [:str, "Foo (with some bars)"], + [:str, "((()))"] + ] ) end From db3fd70ab0eed9898484efdb02b4e9751cc6bd25 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Fri, 15 Jun 2018 12:56:52 +0200 Subject: [PATCH 13/18] Move things to all the right places --- lib/parsers/pdf_parser.rb | 23 ++-- .../parsers/pdf_parser/tokenizer.rb | 112 +++++------------- lib/parsers/pdf_parser/transformer.rb | 107 +++++++++++++++++ ...bject_parser_spec.rb => tokenizer_spec.rb} | 92 +++++++------- 4 files changed, 197 insertions(+), 137 deletions(-) rename spec/parsers/pdf_parser/nu_object_parser.rb => lib/parsers/pdf_parser/tokenizer.rb (64%) create mode 100644 lib/parsers/pdf_parser/transformer.rb rename spec/parsers/pdf_parser/{object_parser_spec.rb => tokenizer_spec.rb} (58%) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index 0ff515ca..c2e61be6 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -1,4 +1,6 @@ class FormatParser::PDFParser + require_relative 'pdf_parser/tokenizer' + require_relative 'pdf_parser/transformer' include FormatParser::IOUtils # First 9 bytes of a PDF should be in this format, according to: @@ -50,7 +52,7 @@ def call(io) next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog') io.seek(xref.offset) object_buf = io.read(xref.length_limit) - parse_object_with_dictionary(object_buf) + parse_pdf_object(object_buf) end raise 'nope' @@ -130,19 +132,20 @@ def pairwise(enum) end end - def read_until_linebreak(io, char_limit: 32) + def read_until_delimiter(io, delimiter:, char_limit: 32) buf = StringIO.new(''.b) char_limit.times do char = safe_read(io, 1).force_encoding(Encoding::BINARY) - if char == "\n" - break - else - buf << char - end + buf << char + break if buf.string.end_with?(delimiter) end buf.string.strip end + def read_until_linebreak(io, char_limit: 32) + read_until_delimiter(io, delimiter: "\n", char_limit: char_limit) + end + def min(*of_items) of_items.sort.shift end @@ -151,8 +154,10 @@ def max(*of_items) of_items.sort.pop end - def parse_object_with_dictionary(str) - File.open(Digest::SHA1.hexdigest(str) + '.pdfobj', 'wb') { |f| f << str } + def parse_pdf_object(str) + token_stream = Tokenizer.new.tokenize(str) + tree = Transformer.new.transform(token_stream) + $stderr.puts tree.inspect end FormatParser.register_parser self, natures: :document, formats: :pdf diff --git a/spec/parsers/pdf_parser/nu_object_parser.rb b/lib/parsers/pdf_parser/tokenizer.rb similarity index 64% rename from spec/parsers/pdf_parser/nu_object_parser.rb rename to lib/parsers/pdf_parser/tokenizer.rb index 659be531..c95ee33f 100644 --- a/spec/parsers/pdf_parser/nu_object_parser.rb +++ b/lib/parsers/pdf_parser/tokenizer.rb @@ -1,4 +1,4 @@ -class NuObjectParser +class FormatParser::PDFParser::Tokenizer Malformed = Class.new(RuntimeError) RE = ->(str) { /#{Regexp.escape(str)}/ } @@ -23,7 +23,7 @@ class NuObjectParser RE['<<'] => :parse_dictionary, RE['['] => :parse_array, RE['('] => :parse_string, - RE['<'] => :parse_hex_string, + /<[0-9a-f]+>/i => :parse_hex_string, /\d+ \d+ R/ => :parse_ref, NAME_RE => :parse_pdf_name, @@ -37,42 +37,24 @@ class NuObjectParser /(\-|\+?)\.(\d+)/ => :wrap_real, /\-?(\d+)/ => :wrap_int, - RE['obj'] => :wrap, - RE['endobj'] => :wrap, - RE['stream'] => :wrap, - RE['endstream'] => :wrap, + RE['obj'] => :wrap_lit, + # Use dirty trick to stop parsing if we encounter anything binary. This does not + # prevent us from reading ahead into the stream, but it does allow is to abort + # quicker + RE['endobj'] => :abort, + RE['stream'] => :abort, + RE['endstream'] => :abort, /\s+/ => :wrap_whitespace, /./ => :garbage, } - # Permitted character escapes. There aren't _that_ many so we can use a table - STRING_ESCAPES = { - "\r" => "\n", - "\n\r" => "\n", - "\r\n" => "\n", - '\\n' => "\n", - '\\r' => "\r", - '\\t' => "\t", - '\\b' => "\b", - '\\f' => "\f", - '\\(' => '(', - '\\)' => ')', - '\\\\' => '\\', - "\\\n" => '', - } - - # Octal character escapes that look like \001 etc - 0.upto(9) { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr } - 0.upto(99) { |n| STRING_ESCAPES['\\0' + n.to_s] = ('0' + n.to_s).oct.chr } - 0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s] = n.to_s.oct.chr } - def wrap_real(pattern) - [:real, @sc.scan(pattern).to_f] + [:real, @sc.scan(pattern)] end def wrap_int(pattern) - [:int, @sc.scan(pattern).to_i] + [:int, @sc.scan(pattern)] end def wrap_whitespace(pattern) @@ -115,13 +97,8 @@ def parse_dictionary(start_pattern) [:dict, dict_items] end - def parse_hex_string(_start_pattern) - str = @sc.scan(/<[0-9a-f]+>/i) - raise Malformed, "Malformed hex string at #{@sc.pos}" unless str - - str << '0' unless str.bytesize.even? - hex_str = str.scan(/../).map { |i| i.hex.chr }.join - [:hex_string, hex_str] + def parse_hex_string(start_pattern) + [:hex_string, @sc.scan(start_pattern)] end def parse_string(opening_brace_pattern) @@ -132,6 +109,7 @@ def parse_string(opening_brace_pattern) str = "" count = 1 bytes_remaining_to_scan.times do + # Terminate if EOS reached or once we encountered the outermost closing brace break if @sc.eos? || count == 0 byte = @sc.scan(/./) @@ -150,19 +128,12 @@ def parse_string(opening_brace_pattern) end break if count == 0 end - unescaped = str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| - STRING_ESCAPES[match] || '' - end - [:str, unescaped] + raise Malformed, "String did not terminate at #{@sc.pos}" if count > 0 + [:str, str] end def parse_pdf_name(start_pattern) - name = @sc.scan(start_pattern) - # Replace #023 hex codes with the corresponding chars - name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code| - $1.to_i(16).chr - end - [:name, name_sans_escapes] + [:name, @sc.scan(start_pattern)] end def garbage(*) @@ -198,47 +169,24 @@ def walk_scanner(halt_at_pattern) end end - def tokenize(str) - @sc = StringScanner.new(str) - @token_stream = [] - walk_scanner(_stop_at_pattern = nil) - @token_stream + # Dirty thing we use to stop parsing as soon as we encounter a "stream", "xstream" + def abort(pattern) + str = @sc.scan(pattern) + debug { "X: Aborting tokenization at #{str.inspect} @#{@sc.pos}" } + throw :_abort_ end - class PDFRef < Struct.new(:object_id, :object_gen) - def initialize(str) - super(*str.scan(/(\d+) (\d+) R/).first) + def tokenize(str, verbose: false) + @verbose = verbose + @sc = StringScanner.new(str.force_encoding(Encoding::BINARY)) + @token_stream = [] + catch :_abort_ do + walk_scanner(_stop_at_pattern = nil) end - end - - class PDFName < Struct.new(:name) - end - - def parse(str) - ast = tokenize(str) - unwrap_token = ->(token) { - if token.length == 2 && token.first.is_a?(Symbol) - token_type, token_value = token - case token_type - when :dict - unwrapped_values = token_value.map(&unwrap_token) - keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 } - Hash[keys.zip(values)] - when :array - token_value.map(&unwrap_token) - when :name - PDFName.new(token_value) - when :lit - {:true => true, :false => false, :null => nil}.fetch(token_value) - end - else - token - end - } - unwrap_token.(ast) + @token_stream end def debug - warn(yield) + warn(yield) if @verbose end end diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb new file mode 100644 index 00000000..b25a990e --- /dev/null +++ b/lib/parsers/pdf_parser/transformer.rb @@ -0,0 +1,107 @@ +class FormatParser::PDFParser::Transformer + class PDFRef < Struct.new(:object_id, :object_gen) + def self.from_ref_str(str) + id_and_generation_str = str.scan(/(\d+) (\d+) R/).first + new(*id_and_generation_str.map(&:to_i)) + end + end + + class PDFName < Struct.new(:name) + end + + # Permitted character escapes. There aren't _that_ many so we can use a table + STRING_ESCAPES = { + "\r" => "\n", + "\n\r" => "\n", + "\r\n" => "\n", + '\\n' => "\n", + '\\r' => "\r", + '\\t' => "\t", + '\\b' => "\b", + '\\f' => "\f", + '\\(' => '(', + '\\)' => ')', + '\\\\' => '\\', + "\\\n" => '', + } + + # Octal character escapes that look like \001 etc + 0.upto(9) { |n| STRING_ESCAPES['\\00' + n.to_s] = ('00' + n.to_s).oct.chr } + 0.upto(99) { |n| STRING_ESCAPES['\\0' + n.to_s] = ('0' + n.to_s).oct.chr } + 0.upto(377) { |n| STRING_ESCAPES['\\' + n.to_s] = n.to_s.oct.chr } + + LITERAL_VALUES = { + :true => true, + :false => false, + :null => nil, + } + + def transform(tokens) + tokens.map {|t| unwrap(*t) } + end + + def unwrap(token_type, token_value) + case token_type + when :dict + unwrap_dict(token_value) + when :array + unwrap_array(token_value) + when :real + unwrap_real(token_value) + when :int + unwrap_int(token_value) + when :ref + unwrap_ref(token_value) + when :name + unwrap_name(token_value) + when :lit + unwrap_lit(token_value) + else + token_value + end + end + + def unwrap_real(value) + value.to_f + end + + def unwrap_int(value) + value.to_i + end + + def unwrap_dict(value) + unwrapped_values = value.map{|e| unwrap(*e) } + keys, values = unwrapped_values.partition.with_index {|_, i| i % 2 == 0 } + Hash[keys.zip(values)] + end + + def unwrap_lit(value) + LITERAL_VALUES.fetch(value, value.to_sym) + end + + def unwrap_ref(value) + PDFRef.from_ref_str(value) + end + + def unwrap_array(value) + value.map {|e| unwrap(*e) } + end + + def unwrap_hex_string(str) + str << '0' unless str.bytesize.even? + str.scan(/../).map { |i| i.hex.chr }.join + end + + def unwrap_string(str) + str.gsub(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match| + STRING_ESCAPES[match] || '' + end + end + + def unwrap_name(name) + # Replace #0xx hex codes with the corresponding chars + name_sans_escapes = name.gsub(/\#([\da-fA-F]{1,2})/) do |_hex_code| + $1.to_i(16).chr + end + end +end diff --git a/spec/parsers/pdf_parser/object_parser_spec.rb b/spec/parsers/pdf_parser/tokenizer_spec.rb similarity index 58% rename from spec/parsers/pdf_parser/object_parser_spec.rb rename to spec/parsers/pdf_parser/tokenizer_spec.rb index a97a0c2d..ec45f628 100644 --- a/spec/parsers/pdf_parser/object_parser_spec.rb +++ b/spec/parsers/pdf_parser/tokenizer_spec.rb @@ -1,12 +1,19 @@ require 'spec_helper' -require_relative 'nu_object_parser' -describe 'Object parser' do +describe FormatParser::PDFParser::Tokenizer do + def tokenize(str) + FormatParser::PDFParser::Tokenizer.new.tokenize(str) + end + + def tokenize_file_at(at_path) + FormatParser::PDFParser::Tokenizer.new.tokenize(File.read(at_path)) + end + describe 'with extracted objects from corpus' do fixture_paths = Dir.glob(__dir__ + '/*.pdfobj').sort fixture_paths.each do |path| it "scans #{File.basename(path)}" do - result = NuObjectParser.new.parse(File.read(path)) + result = tokenize_file_at(path) require 'pp' pp result end @@ -14,9 +21,7 @@ end it 'scans the example object from the PDF presentation' do - obj = File.read(__dir__ + '/example_a.pdfobj') - parser = NuObjectParser.new - result = parser.parse(obj) + result = tokenize_file_at(__dir__ + '/example_a.pdfobj') expect(result).to eq( [ [ @@ -42,51 +47,51 @@ end it 'scans a simple dictionary with strings and ints as values' do - result = NuObjectParser.new.parse('<>') + result = tokenize('<>') expect(result).to eq( - [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]] + [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]] ) end it 'scans a simple dictionary with arbitrary whitespace' do - result = NuObjectParser.new.parse('<< + result = tokenize('<< /Name (Jim) /Age 25>>') expect(result).to eq( - [[:dict, [[:name, '/Name'], 'Jim', [:name, '/Age'], [:int, 25]]]] + [[:dict, [[:name, "/Name"], [:str, "Jim"], [:name, "/Age"], [:int, "25"]]]] ) end it 'parses all kinds of reals' do - result = NuObjectParser.new.parse('34.5 -3.62 +123.6 4. -.002 0.0') + result = tokenize('34.5 -3.62 +123.6 4. -.002 0.0') expect(result).to eq( - [[:real, 34.5], [:real, -3.62], [:real, 123.6], [:real, 4.0], [:real, -0.002], [:real, 0.0]] + [[:real, "34.5"], [:real, "-3.62"], [:real, "+123.6"], [:real, "4."], [:real, "-.002"], [:real, "0.0"]] ) end it 'parses an array of integers' do - result = NuObjectParser.new.parse('[1 2 3 4]') + result = tokenize('[1 2 3 4]') expect(result).to eq( - [[:array, [[:int, 1], [:int, 2], [:int, 3], [:int, 4]]]] + [[:array, [[:int, "1"], [:int, "2"], [:int, "3"], [:int, "4"]]]] ) end it 'scans an array of integers with one object ref in the middle' do - result = NuObjectParser.new.parse('[1 20 00 R 3]') + result = tokenize('[1 20 00 R 3]') expect(result).to eq( - [[:array, [[:int, 1], [:ref, '20 00 R'], [:int, 3]]]] + [[:array, [[:int, "1"], [:ref, "20 00 R"], [:int, "3"]]]] ) end it 'scans an array of names' do - result = NuObjectParser.new.parse('[ /Type /Color /Medium/Rare ]') + result = tokenize('[ /Type /Color /Medium/Rare ]') expect(result).to eq( [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] ) - result = NuObjectParser.new.parse('[/Type/Color/Medium/Rare]') + result = tokenize('[/Type/Color/Medium/Rare]') expect(result).to eq( [[:array, [[:name, '/Type'], [:name, '/Color'], [:name, '/Medium'], [:name, '/Rare']]]] ) @@ -106,72 +111,67 @@ /A#42 / ) - result = NuObjectParser.new.parse(names_str) + result = tokenize(names_str) expect(result).to eq([ - [:name, '/Name1'], - [:name, '/ASomewhatLongerName'], - [:name, '/A;Name_With-Various***Characters?'], - [:name, '/1.2'], - [:name, '/$$'], - [:name, '/@pattern'], - [:name, '/.notdef'], - [:name, '/Adobe Green'], - [:name, '/PANTONE 5757 CV'], - [:name, '/paired()parentheses'], - [:name, '/The_Key_of_F#_Minor'], - [:name, '/AB'], - [:name, '/'] + [:name, "/Name1"], + [:name, "/ASomewhatLongerName"], + [:name, "/A;Name_With-Various***Characters?"], + [:name, "/1.2"], + [:name, "/$$"], + [:name, "/@pattern"], + [:name, "/.notdef"], + [:name, "/Adobe#20Green"], + [:name, "/PANTONE#205757#20CV"], + [:name, "/paired#28#29parentheses"], + [:name, "/The_Key_of_F#23_Minor"], + [:name, "/A#42"], + [:name, "/"] ]) end it 'handles paired braces and strings escapes' do - result = NuObjectParser.new.parse(' + result = tokenize(' (Foo \\(with some bars\\)) (Foo () bar and (baz)) (Foo (with some bars)) (((()))) ') expect(result).to eq( - [ - [:str, "Foo (with some bars)"], - [:str, "Foo () bar and (baz)"], - [:str, "Foo (with some bars)"], - [:str, "((()))"] - ] + [[:str, "Foo \\(with some bars\\)"], [:str, "Foo () bar and (baz)"], [:str, "Foo (with some bars)"], [:str, "((()))"]] ) end it 'detects an unterminated string' do expect { - NuObjectParser.new.parse('(Hello there') + tokenize('(Hello there') }.to raise_error(/did not terminate/) end it 'detects an unterminated array' do expect { - NuObjectParser.new.parse('[') + tokenize('[') }.to raise_error(/did not terminate/) end it 'detects an unterminated dictionary' do expect { - NuObjectParser.new.parse('<< /Ohai') + tokenize('<< /Ohai') }.to raise_error(/did not terminate/) end it 'detects a truncated dictionary opener' do expect { - NuObjectParser.new.parse('< Date: Fri, 15 Jun 2018 13:47:33 +0200 Subject: [PATCH 14/18] Continue --- lib/parsers/pdf_parser.rb | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index c2e61be6..06a856ac 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -32,7 +32,8 @@ def call(io) io.seek(xref_offset) xref_table = parse_xref_table(io) - # return unless xref_table.any? + return unless xref_table.any? + xref_table.each do |xref| io.seek(xref.offset) # From here on out we need to proceed as follows. We need to buffer (preemptively) @@ -45,14 +46,18 @@ def call(io) # Then we need to actually go in, read the object and parse the dictionary - luckily # this is not that much trouble and we can read the entire object, since it is small. # So let's get at it. - next if xref.length_limit > 1024 # Skip objects which are too large, they won't be headers anyway + next if xref.length_limit > 1024 # Skip objects which are too large, they aren't what we are looking for anyway + + # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need + # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?) + # but in practice we should be able to get away with just a few things here. + obj_header = safe_read(io, 64) + next unless obj_header.include?('/Pages') || obj_header.include?('/Catalog') - # Do a quickie detection reading just a tiny piece of the object - obj_header = safe_read(io, 32) - next unless obj_header.include?('/Type/Pages') || obj_header.include?('/Type/Catalog') io.seek(xref.offset) + # Reduce the length limit - we should read less of it if we can object_buf = io.read(xref.length_limit) - parse_pdf_object(object_buf) + extract_pdf_object_dictionary(object_buf) end raise 'nope' @@ -154,10 +159,14 @@ def max(*of_items) of_items.sort.pop end - def parse_pdf_object(str) + def extract_pdf_object_dictionary(str) token_stream = Tokenizer.new.tokenize(str) tree = Transformer.new.transform(token_stream) - $stderr.puts tree.inspect + # Locate the first hash in the parse tree + first_hash = tree.find {|e| e.is_a?(Hash) } + $stderr.puts first_hash.inspect + rescue => e + # Malformed PDF object or our parser has failed somewhere end FormatParser.register_parser self, natures: :document, formats: :pdf From b517be80e54d4fde47321312dcf3559856e346ed Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Fri, 15 Jun 2018 18:42:55 +0200 Subject: [PATCH 15/18] Strscan is required --- lib/parsers/pdf_parser/tokenizer.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/parsers/pdf_parser/tokenizer.rb b/lib/parsers/pdf_parser/tokenizer.rb index c95ee33f..63528735 100644 --- a/lib/parsers/pdf_parser/tokenizer.rb +++ b/lib/parsers/pdf_parser/tokenizer.rb @@ -1,3 +1,5 @@ +require 'strscan' + class FormatParser::PDFParser::Tokenizer Malformed = Class.new(RuntimeError) RE = ->(str) { /#{Regexp.escape(str)}/ } From fbf145e5e3554777c31e44c3034f9bc2569fd736 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Fri, 15 Jun 2018 18:43:11 +0200 Subject: [PATCH 16/18] Get rid of the special Name type we ended up not using --- lib/parsers/pdf_parser/transformer.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/parsers/pdf_parser/transformer.rb b/lib/parsers/pdf_parser/transformer.rb index b25a990e..7aab792a 100644 --- a/lib/parsers/pdf_parser/transformer.rb +++ b/lib/parsers/pdf_parser/transformer.rb @@ -6,9 +6,6 @@ def self.from_ref_str(str) end end - class PDFName < Struct.new(:name) - end - # Permitted character escapes. There aren't _that_ many so we can use a table STRING_ESCAPES = { "\r" => "\n", From 12ffb7209bbbac4143a8dbb9a44517771c824a42 Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Fri, 15 Jun 2018 18:44:32 +0200 Subject: [PATCH 17/18] Patch up the parser a bit --- lib/parsers/pdf_parser.rb | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/lib/parsers/pdf_parser.rb b/lib/parsers/pdf_parser.rb index 06a856ac..9a584b7e 100644 --- a/lib/parsers/pdf_parser.rb +++ b/lib/parsers/pdf_parser.rb @@ -32,10 +32,7 @@ def call(io) io.seek(xref_offset) xref_table = parse_xref_table(io) - return unless xref_table.any? - xref_table.each do |xref| - io.seek(xref.offset) # From here on out we need to proceed as follows. We need to buffer (preemptively) # all the /Type/Pages objects for later. We also need to recover the # /Type/Catalog object which will refer us to the right /Type /Pages object to use. @@ -51,20 +48,23 @@ def call(io) # Do a quickie detection reading just a tiny piece of the object. Strictly speaking we need # to parse the entire object (what if there are 9000 spaces between "/Type" and "/Pages" ?) # but in practice we should be able to get away with just a few things here. - obj_header = safe_read(io, 64) - next unless obj_header.include?('/Pages') || obj_header.include?('/Catalog') + io.seek(xref.offset) + obj_header = io.read(64).to_s + next unless obj_header.include?('/Pages') || obj_header.include?('/Linearized') + # Seek to that object and read it whole, to the length limit or 1024 bytes whichever is lower io.seek(xref.offset) - # Reduce the length limit - we should read less of it if we can - object_buf = io.read(xref.length_limit) - extract_pdf_object_dictionary(object_buf) + object_buf = io.read(min(1024, xref.length_limit)) + dict = extract_pdf_object_dictionary(object_buf) + if dict['/Type'] == '/Pages' && dict['/Count'] + return FormatParser::Document.new(format: :pdf, page_count: dict['/Count']) + elsif dict['/Linearized'] && dict['/N'] + return FormatParser::Document.new(format: :pdf, page_count: dict['/N']) + end end - raise 'nope' - FormatParser::Document.new( - format: :pdf, - page_count: attributes[:page_count] - ) + # We could not determine page count + FormatParser::Document.new(format: :pdf) end def locate_xref_table_offset(io) @@ -140,9 +140,8 @@ def pairwise(enum) def read_until_delimiter(io, delimiter:, char_limit: 32) buf = StringIO.new(''.b) char_limit.times do - char = safe_read(io, 1).force_encoding(Encoding::BINARY) - buf << char - break if buf.string.end_with?(delimiter) + buf << safe_read(io, 1).force_encoding(Encoding::BINARY) + break if buf.string.end_with?(delimiter) || buf.string.bytesize >= char_limit end buf.string.strip end @@ -162,11 +161,13 @@ def max(*of_items) def extract_pdf_object_dictionary(str) token_stream = Tokenizer.new.tokenize(str) tree = Transformer.new.transform(token_stream) - # Locate the first hash in the parse tree + # Locate the first hash (dictionary) in the parse tree first_hash = tree.find {|e| e.is_a?(Hash) } - $stderr.puts first_hash.inspect + first_hash || {} rescue => e + $stderr.puts e # Malformed PDF object or our parser has failed somewhere + {} end FormatParser.register_parser self, natures: :document, formats: :pdf From dc9e2a3fe289e7d00ebb4a7043e9f329f3d7822c Mon Sep 17 00:00:00 2001 From: Julik Tarkhanov Date: Sat, 16 Jun 2018 00:40:14 +0200 Subject: [PATCH 18/18] Make spec titles a bit neater --- spec/parsers/pdf_parser_spec.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/parsers/pdf_parser_spec.rb b/spec/parsers/pdf_parser_spec.rb index 2a99f590..33fe477e 100644 --- a/spec/parsers/pdf_parser_spec.rb +++ b/spec/parsers/pdf_parser_spec.rb @@ -13,13 +13,14 @@ shared_examples :behave_like_pdf do |hash| let(:pdf_file) { hash.fetch(:file) } - it 'acts as a pdf' do + it 'is recognized as PDF' do expect(parsed_pdf).not_to be_nil expect(parsed_pdf.nature).to eq(:document) expect(parsed_pdf.format).to eq(:pdf) end it 'has a correct page count' do + expect(parsed_pdf).not_to be_nil expect(parsed_pdf.page_count).to eq(hash.fetch(:page_count)) end end