Class: Rpdf2txt::PageLeaf
- Defined in:
- lib/rpdf2txt/object.rb
Instance Attribute Summary collapse
-
#contents ⇒ Object
readonly
Returns the value of attribute contents.
-
#resources ⇒ Object
readonly
Returns the value of attribute resources.
Attributes inherited from TreeNode
Attributes inherited from PdfObject
#attributes, #decoder, #oid, #src
Instance Method Summary collapse
- #build_tree(object_catalogue, parent = nil) ⇒ Object
- #font(key) ⇒ Object
-
#initialize(*args) ⇒ PageLeaf
constructor
A new instance of PageLeaf.
- #media_box ⇒ Object
- #merge_snippets(text_snippets) ⇒ Object
- #text(callback_handler) ⇒ Object
Methods inherited from TreeNode
Methods inherited from PdfObject
#_parse_attributes, #catalogue_object, #decoded_stream, #extract_attribute_stream, #parse_attributes, #revision_id
Constructor Details
#initialize(*args) ⇒ PageLeaf
Returns a new instance of PageLeaf.
505 506 507 508 |
# File 'lib/rpdf2txt/object.rb', line 505 def initialize(*args) super @text_state = TextState.new(@target_encoding) end |
Instance Attribute Details
#contents ⇒ Object (readonly)
Returns the value of attribute contents.
504 505 506 |
# File 'lib/rpdf2txt/object.rb', line 504 def contents @contents end |
#resources ⇒ Object (readonly)
Returns the value of attribute resources.
504 505 506 |
# File 'lib/rpdf2txt/object.rb', line 504 def resources @resources end |
Instance Method Details
#build_tree(object_catalogue, parent = nil) ⇒ Object
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 |
# File 'lib/rpdf2txt/object.rb', line 509
#
# Resolves this page's content objects and its resources against the object
# catalogue, then hands over to the superclass implementation.
#
# object_catalogue - collection indexed by object id via #[].
# parent           - not used directly here; it reaches the superclass
#                    through the bare `super` call.
#
# Returns whatever the superclass #build_tree returns.
def build_tree(object_catalogue, parent=nil)
  @contents = []
  extract_oids(@attributes[:contents]).each do |oid|
    entry = object_catalogue[oid]
    @contents << entry
    # Only entries that can build their own subtree are descended into.
    next unless entry.respond_to?(:build_tree)
    entry.build_tree(object_catalogue, self)
  end

  declared = @attributes[:resources]
  # NOTE(review): @parent is read below before `super` has run. If the
  # superclass is what assigns @parent from the `parent` argument, the
  # inheritance branch cannot see it on the first build -- confirm.
  @resources =
    if declared.is_a?(String)
      # A string is treated as a reference: resolve its first object id.
      object_catalogue[extract_oids([declared]).first]
    elsif declared.is_a?(Hash)
      Resource.new(declared)
    elsif declared.nil? && @parent
      @parent.resources
    else
      Resource.new
    end

  @resources.build_tree(object_catalogue) if @resources.is_a?(Resource)
  super
end
#font(key) ⇒ Object
529 530 531 |
# File 'lib/rpdf2txt/object.rb', line 529 def font(key) @resources.font(key) end |
#media_box ⇒ Object
532 533 534 535 536 |
# File 'lib/rpdf2txt/object.rb', line 532
#
# A page leaf has no media box of its own in this method: the value is taken
# from the parent node.
#
# Returns the parent's media_box, or nil when there is no parent.
def media_box
  parent.media_box if parent
end
#merge_snippets(text_snippets) ⇒ Object
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 |
# File 'lib/rpdf2txt/object.rb', line 537
#
# Collapses every run of consecutive snippets that compare equal (==) into a
# single snippet carrying the concatenated text of the whole run.
#
# This is required for PDF files written by pdfFactory 3.25
# (Windows Server 2003 R2 Standard Edition German): it builds a meaningful
# snippet out of the small snippets that share the same x, y position.
# See in more detail:
# * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
# * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
#
# text_snippets - enumerable of objects responding to #txt, #txt=, #dup
#                 and ==.
#
# Returns a new Array holding one snippet per run: a dup of the run's last
# member whose txt is the run's concatenated text. Empty input yields [].
# The snippets passed in, and their txt strings, are not modified.
def merge_snippets(text_snippets)
  merged = []
  tail = nil    # most recent snippet of the run currently being collected
  buffer = nil  # text accumulated for that run

  # Closes the current run: copy its last snippet and attach the joined text.
  emit = lambda do
    copy = tail.dup
    copy.txt = buffer
    merged << copy
  end

  text_snippets.each do |snip|
    if tail && tail == snip
      buffer << snip.txt
    else
      emit.call if tail
      # Work on a copy so appending never alters the caller's string.
      buffer = snip.txt.dup
    end
    tail = snip
  end

  # The final run has no following snippet to trigger its emission.
  emit.call if tail
  merged
end
#text(callback_handler) ⇒ Object
567 568 569 570 571 572 573 574 575 576 577 578 579 580 |
# File 'lib/rpdf2txt/object.rb', line 567
#
# Extracts this page's text: gathers the content streams into one stream,
# pulls the text objects out of it, merges them and joins the result.
#
# callback_handler - passed through unchanged to join_snippets.
#
# Returns whatever join_snippets returns.
def text(callback_handler)
  combined = Stream.new('')
  if @contents.size == 1 && @contents.first.is_a?(ReferenceArray)
    # A lone ReferenceArray assembles the stream itself.
    @contents.first.build_stream(combined)
  else
    @contents.each do |part|
      combined.append(part.decoded_stream)
    end
  end

  @text_state.media_box = media_box
  snippets = merge_snippets(combined.extract_text_objects(self, @text_state))
  join_snippets(snippets, callback_handler)
end