Class: Rpdf2txt::PageLeaf

Inherits:
TreeNode show all
Defined in:
lib/rpdf2txt/object.rb

Instance Attribute Summary collapse

Attributes inherited from TreeNode

#parent

Attributes inherited from PdfObject

#attributes, #decoder, #oid, #src

Instance Method Summary collapse

Methods inherited from TreeNode

#each, #extract_oids, #root?

Methods inherited from PdfObject

#_parse_attributes, #catalogue_object, #decoded_stream, #extract_attribute_stream, #parse_attributes, #revision_id

Constructor Details

#initialize(*args) ⇒ PageLeaf

Returns a new instance of PageLeaf.



505
506
507
508
# File 'lib/rpdf2txt/object.rb', line 505

# Builds a PageLeaf. All arguments are handed unchanged to the superclass
# constructor; afterwards a TextState is created from @target_encoding and
# kept in @text_state for later use by #text.
# NOTE(review): @target_encoding is not assigned in this method -- presumably
# the superclass constructor sets it; confirm.
def initialize(*args)
	super
	# Text state object that #text passes to the stream's text extraction.
	@text_state = TextState.new(@target_encoding)
end

Instance Attribute Details

#contents ⇒ Object (readonly)

Returns the value of attribute contents.



504
505
506
# File 'lib/rpdf2txt/object.rb', line 504

# Reader for @contents. #build_tree assigns it an Array holding the objects
# looked up in the object catalogue for this page's :contents attribute.
def contents
  @contents
end

#resources ⇒ Object (readonly)

Returns the value of attribute resources.



504
505
506
# File 'lib/rpdf2txt/object.rb', line 504

# Reader for @resources, the resource object resolved by #build_tree
# (looked up in the catalogue, built from a Hash, taken from the parent, or
# a fresh empty Resource). #font delegates to it.
def resources
  @resources
end

Instance Method Details

#build_tree(object_catalogue, parent = nil) ⇒ Object



509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# File 'lib/rpdf2txt/object.rb', line 509

# Resolves this page's content objects and resources against
# object_catalogue, then continues with the superclass implementation.
#
# object_catalogue:: lookup (indexed by object id) of parsed PDF objects
# parent::           forwarded to the superclass implementation via +super+
#
# @contents becomes the list of catalogue entries named by the :contents
# attribute; every entry that responds to #build_tree is built recursively
# with this page as its parent.
#
# @resources is chosen from the :resources attribute:
# * String - treated as a reference and looked up in the catalogue
# * Hash   - wrapped in a new Resource
# * nil    - taken from @parent when @parent is set
# * anything else (or nil without @parent) - a new empty Resource
#
# NOTE(review): @parent is read before +super+ runs; if the superclass is
# what assigns @parent, the inheritance branch only applies on a repeated
# build -- confirm against TreeNode#build_tree.
def build_tree(object_catalogue, parent=nil)
	@contents = []
	extract_oids(@attributes[:contents]).each do |content_id|
		node = object_catalogue[content_id]
		@contents << node
		node.build_tree(object_catalogue, self) if node.respond_to?(:build_tree)
	end

	declared = @attributes[:resources]
	@resources =
		if declared.is_a?(String)
			object_catalogue[extract_oids([declared]).first]
		elsif declared.is_a?(Hash)
			Resource.new(declared)
		elsif declared.nil? && @parent
			@parent.resources
		else
			Resource.new
		end

	# Only Resource instances are built here; other catalogue entries are
	# left as they are.
	@resources.build_tree(object_catalogue) if @resources.is_a?(Resource)
	super
end

#font(key) ⇒ Object



529
530
531
# File 'lib/rpdf2txt/object.rb', line 529

# Looks up a font by delegating to the page's resources.
#
# key:: passed through unchanged to @resources.font
#
# Returns whatever @resources.font(key) returns. @resources is assigned by
# #build_tree, so this raises NoMethodError if called before the tree has
# been built.
def font(key)
	@resources.font(key)
end

#media_box ⇒ Object



532
533
534
535
536
# File 'lib/rpdf2txt/object.rb', line 532

# Returns the media box reported by the parent node, or nil when this page
# has no parent. The page itself does not supply a media box here.
def media_box
	parent.media_box if parent
end

#merge_snippets(text_snippets) ⇒ Object



537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
# File 'lib/rpdf2txt/object.rb', line 537

# Collapses each run of consecutive snippets that compare equal (==) into a
# single snippet whose text is the concatenation of the run's texts.
#
# This is required for the pdf file that is written by pdfFactory 3.25
# (Windows Server 2003 R2 Standard Edition German), which produces many
# small snippets; it builds up a meaningful snippet from the small snippets
# whose x, y positions are the same.
# See in more detail:
#  * http://dev.ywesee.com/wiki.php/Masa/20110516-trace-rpdf2txt
#  * http://dev.ywesee.com/wiki.php/Masa/20110517-update-rpdf2txt
#
# text_snippets:: ordered collection of snippets responding to #txt, #txt=,
#                 #== and #dup
#
# Returns a new Array with one entry per run: a #dup of the run's last
# snippet carrying the merged text. An empty input yields an empty Array.
# As before, the last snippet of every run has its #txt overwritten with the
# merged text before it is duplicated.
def merge_snippets(text_snippets)
  merged   = []
  previous = nil # snippet visited in the preceding iteration (tail of the current run)
  buffer   = nil # text accumulated for the current run

  text_snippets.each do |snip|
    if previous && previous == snip
      buffer << snip.txt
    else
      if previous
        previous.txt = buffer
        merged << previous.dup
      end
      # Work on a copy so appending never mutates (or trips over a frozen)
      # string owned by an input snippet.
      buffer = snip.txt.dup
    end
    previous = snip
  end

  # Flush the final run; skipped entirely when there were no snippets.
  if previous
    previous.txt = buffer
    merged << previous.dup
  end
  merged
end

#text(callback_handler) ⇒ Object



567
568
569
570
571
572
573
574
575
576
577
578
579
580
# File 'lib/rpdf2txt/object.rb', line 567

# Extracts the page's text and passes it on to callback_handler.
#
# callback_handler:: forwarded unchanged to join_snippets
#
# Steps:
# 1. Gather the page content into one Stream. When @contents holds exactly
#    one ReferenceArray, that object fills the stream itself; otherwise the
#    decoded stream of every content object is appended in order.
# 2. Give the text state this page's media box.
# 3. Extract the text objects from the combined stream, merge them with
#    #merge_snippets and hand the result to join_snippets.
#
# Returns the result of join_snippets.
def text(callback_handler)
	page_stream = Stream.new('')
	lone_reference_array = @contents.size == 1 && @contents.first.is_a?(ReferenceArray)
	if lone_reference_array
		@contents.first.build_stream(page_stream)
	else
		@contents.each do |part|
			page_stream.append(part.decoded_stream)
		end
	end

	@text_state.media_box = media_box
	snippets = merge_snippets(page_stream.extract_text_objects(self, @text_state))
	join_snippets(snippets, callback_handler)
end