Class: Rpdf2txt::Stream

Inherits:

PdfObject

Object
PdfObject
Rpdf2txt::Stream

show all

Defined in: lib/rpdf2txt/object.rb

Direct Known Subclasses

CMap, Image, ObjStream

Constant Summary collapse

BT_PATTERN =

/\bBT\b(?!(\\[()]|[^(\\])*\))/mn

ET_PATTERN =

/\bET\b(?!(\\[()]|[^(\\])*\))/mn

FAIL_PTRN =

/\((\\[()]|[^)])*\bET\b\s*$/mn

@@nontext_scan_pattern =

%r!(?:#{dm_str})|(\b[qQ]\b)|#{xobj}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn

@@hr_scan_pattern =

/#{num}#{num}(\b[lm]\b)/mn

Instance Attribute Summary

Attributes inherited from PdfObject

#attributes, #decoder, #oid, #src

Instance Method Summary collapse

Methods inherited from PdfObject

#_parse_attributes, #build_tree, #catalogue_object, #extract_attribute_stream, #initialize, #parse_attributes, #revision_id

Constructor Details

This class inherits a constructor from Rpdf2txt::PdfObject

Instance Method Details

#append(decoded_stream) ⇒ `Object`



702
703
704

# File 'lib/rpdf2txt/object.rb', line 702

# Appends +decoded_stream+ to the decoded content cached on this object,
# starting from an empty string when nothing has been cached yet.
#
# Returns the accumulated String.
def append(decoded_stream)
  @decoded_stream ||= ''
  @decoded_stream << decoded_stream
end

#decode_raw_stream ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 816

# Produces the decoded content of the stream.
#
# The raw stream is read first and stored in @decrypted_stream; when a
# decoder is present its #decrypt result (called with this object)
# replaces it. Each entry found under @attributes[:filter] (a single value
# or an arbitrarily nested list, nils ignored) is then applied in order.
#
# A filter that is not implemented, or that raises a StandardError while
# decoding, only produces a warning: the data is passed on unchanged to the
# next filter.
#
# Returns the data left after the last filter.
def decode_raw_stream
  @decrypted_stream = raw_stream
  @decrypted_stream = @decoder.decrypt(self) unless @decoder.nil?

  filters = [@attributes[:filter]].flatten.compact
  filters.inject(@decrypted_stream) do |data, filter|
    begin
      case filter
      when "/FlateDecode" then flate_decode(data)
      when "/LZWDecode"   then lzw_decode(data)
      else
        raise "Unimplemented filter: #{filter}"
      end
    rescue StandardError => err
      warn "'#{err.message}' when filtering with #{filter}"
      # best effort: keep the data as it was before this filter
      data
    end
  end
end

#decoded_stream ⇒ `Object`



708
709
710

# File 'lib/rpdf2txt/object.rb', line 708

# Returns the decoded stream content, computing it through
# #decode_raw_stream on first use and caching it in @decoded_stream.
def decoded_stream
  return @decoded_stream if @decoded_stream

  @decoded_stream = decode_raw_stream
end

#decoded_stream=(decoded_stream) ⇒ `Object`



705
706
707

# File 'lib/rpdf2txt/object.rb', line 705

# Replaces the cached decoded content with +decoded_stream+. Once a truthy
# value is stored here, #decoded_stream returns it instead of decoding the
# raw stream.
def decoded_stream=(decoded_stream)
	@decoded_stream = decoded_stream
end

#extract_horizontal_rules(dm_src, dmatrix, result) ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 711

# Scans +dm_src+ with the class-level @@hr_scan_pattern and appends a
# HorizontalRule to +result+ for every 'l' match whose y equals the y of the
# previously seen point while its x differs from the previous x.
#
# For each match the first two captures are read as floats (x, y) and the
# last capture selects the operator: 'l' may emit a rule, 'm' only moves the
# remembered point. (Presumably these are the PDF lineto / moveto path
# operators - confirm against the pattern definition.)
#
# dm_src  - String to scan.
# dmatrix - matrix handed to every HorizontalRule that is created.
# result  - Array receiving the rules; each rule is tagged with @page and
#           @text_state before being pushed.
def extract_horizontal_rules(dm_src, dmatrix, result)
  prev_x = prev_y = 0
  dm_src.scan(@@hr_scan_pattern) do |captures|
    operator = captures.last.to_s[-1]
    is_line = (?l == operator)
    next unless is_line || ?m == operator

    x = captures[0].to_f
    y = captures[1].to_f
    if is_line && x != prev_x && y == prev_y
      rule = HorizontalRule.new(x, y, dmatrix)
      rule.current_page = @page
      rule.text_state = @text_state
      result.push(rule)
    end
    # both operators update the remembered point
    prev_x, prev_y = x, y
  end
end

#extract_nontext_objects(dm_src, dmatrix, stack, result) ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 732

# Scans +dm_src+ with the class-level @@nontext_scan_pattern and processes
# every match according to its last non-nil capture:
#
# 'q'  - pushes the current matrix onto +stack+.
# 'Q'  - restores the matrix most recently pushed onto +stack+. When the
#        stack is empty (unbalanced Q) the current matrix is kept, so that
#        a nil matrix is never handed on.
# 'Do' - pushes an ImagePlacement built from the capture preceding 'Do',
#        positioned at the x/y of the last element of +result+ (0, 0 when
#        +result+ is empty).
# 'EI' - builds an InlineImage from the first two captures (the second one
#        stripped) and pushes an ImagePlacement for it at 0, 0.
# else - reads the first six captures as floats into a 3x3 Matrix and
#        multiplies the current matrix by it.
#
# Every ImagePlacement is tagged with @page and @text_state.
#
# dm_src  - String to scan.
# dmatrix - the current matrix.
# stack   - Array used as the save/restore stack; mutated in place.
# result  - Array receiving the placements; mutated in place.
#
# Returns the matrix in effect after the whole of +dm_src+ was processed.
def extract_nontext_objects(dm_src, dmatrix, stack, result)
  dm_src.scan(@@nontext_scan_pattern) { |matches|
    matches = matches.compact
    case matches.last
    when 'q'
      stack.push(dmatrix)
    when 'Q'
      # Array#pop returns nil on an empty stack; fall back to the current
      # matrix instead of continuing with nil.
      dmatrix = stack.pop || dmatrix
    when 'Do'
      x, y = (txt = result.last) ? [txt.x, txt.y] : [0, 0]
      ip = ImagePlacement.new(matches[-2], x, y, dmatrix)
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    when 'EI'
      attrs, data, _ = matches
      im = InlineImage.new attrs, data.strip
      ip = ImagePlacement.new im, 0, 0, dmatrix
      ip.current_page, ip.text_state = @page, @text_state
      result.push ip
    else
      mmatrix = Matrix[[matches[0].to_f, matches[1].to_f, 0],
                       [matches[2].to_f, matches[3].to_f, 0],
                       [matches[4].to_f, matches[5].to_f, 1]]
      dmatrix = dmatrix * mmatrix
    end
  }
  dmatrix
end

#extract_text_objects(page, text_state) ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 760

# Walks the decoded stream and collects everything the scanners find.
#
# The stream is cut into text sections (from a BT_PATTERN match up to the
# matching ET_PATTERN match). For every section, the source in front of it
# is first handed to #extract_nontext_objects and #extract_horizontal_rules,
# then the section itself is scanned through Text#scan.
#
# An ET match for which FAIL_PTRN matches the text up to it is skipped in
# favour of the next ET match.
#
# page       - object stored in @page and on every Text; when given, its
#              attributes[:rotate] (degrees) seeds the initial matrix.
# text_state - object stored in @text_state and on every Text.
#
# Returns an Array with the collected objects; it is empty when the stream
# contains no ET_PATTERN match at all.
def extract_text_objects(page, text_state)
  @page, @text_state = page, text_state
  stream = decoded_stream
  stack = []
  result = []
  startpoint = stream.index(BT_PATTERN)
  endpoint = stream.index(ET_PATTERN)
  # endpoint is nil when no (further) ET exists; without the guard the
  # arithmetic below raised NoMethodError for streams without text.
  while endpoint && FAIL_PTRN.match(stream[0..(endpoint + 2)])
    endpoint = stream.index(ET_PATTERN, endpoint.next)
  end
  unless startpoint && endpoint && (startpoint < endpoint)
    startpoint = 0
  end
  rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
  dmatrix = Matrix[[Math.cos(rotation), Math.sin(rotation), 0],
                   [Math.sin(rotation), -Math.cos(rotation), 0],
                   [0, 0, 1]]

  dm_src = stream[0...startpoint]
  while endpoint && startpoint
    ### pick out the bits in between Text that are relevant to
    ### text positioning (such as the device-transformation-matrix)
    ### NOTE: as far as I understand, the device matrix should
    ###       not be used to position text. However it is used
    ###       by some PDF-Creators and therefore we have to include
    ###       it in our calculations.
    dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
    extract_horizontal_rules(dm_src, dmatrix, result)
    tsrc = stream[startpoint..(endpoint + 2)]
    while FAIL_PTRN.match(tsrc)
      # NOTE(review): when no further ET exists the -1 fallback makes the
      # slice below end at index 1, which drops the section - confirm that
      # this is intended rather than reading up to the end of the stream.
      endpoint = stream.index(ET_PATTERN, endpoint + 2) || -1
      tsrc = stream[startpoint..(endpoint + 2)]
    end
    text = Text.new(tsrc, @target_encoding, dmatrix)
    text.current_page = page
    text.text_state = text_state
    result.concat text.scan
    startpoint = stream.index(BT_PATTERN, endpoint)
    if startpoint
      # everything between the previous ET and the next BT
      dm_src = stream[endpoint...startpoint]
      endpoint = stream.index(ET_PATTERN, startpoint)
    end
  end
  result
end

#flate_decode(data) ⇒ `Object`



838
839
840

# File 'lib/rpdf2txt/object.rb', line 838

# Inflates zlib-compressed +data+ (used for the "/FlateDecode" filter) and
# returns the decompressed String. Errors raised by Zlib are not handled
# here.
def flate_decode(data)
  Zlib.inflate(data)
end

#lzw_decode(data) ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 841

# Decodes +data+ with LZW.decode (used for the "/LZWDecode" filter).
#
# The LZW implementation is loaded on first use. When @attributes[:length]
# is set, +data+ is cut down to that many leading characters first. The
# second argument to LZW.decode is @attributes[:decodeparms][:earlychange]
# as an Integer, or 1 when that value is absent.
def lzw_decode(data)
  require 'rpdf2txt/lzw'
  params = @attributes[:decodeparms]
  early_change = params && params[:earlychange]
  length = @attributes[:length]
  data = data[0, length.to_i] if length
  LZW.decode(data, (early_change || 1).to_i)
end

#raw_stream ⇒ `Object`

# File 'lib/rpdf2txt/object.rb', line 804

# Returns the raw, still encoded bytes found between the "stream" and
# "endstream" keywords of @src, cached in @raw_stream.
#
# When @src holds no stream section an empty String is returned. (The
# former fallback converted an empty Array with #to_s, which yields the
# literal "[]" on Ruby >= 1.9.)
def raw_stream
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1] || String.new
end

#to_cmap ⇒ `Object`



849
850
851

# File 'lib/rpdf2txt/object.rb', line 849

# Builds and returns a CMap from this object's source (@src) and
# @target_encoding.
def to_cmap
  CMap.new(@src, @target_encoding)
end

Class: Rpdf2txt::Stream

Direct Known Subclasses

Constant Summary collapse

Instance Attribute Summary

Attributes inherited from PdfObject

Instance Method Summary collapse

Methods inherited from PdfObject

Constructor Details

Instance Method Details

#append(decoded_stream) ⇒ Object

#decode_raw_stream ⇒ Object

#decoded_stream ⇒ Object

#decoded_stream=(decoded_stream) ⇒ Object

#extract_horizontal_rules(dm_src, dmatrix, result) ⇒ Object

#extract_nontext_objects(dm_src, dmatrix, stack, result) ⇒ Object

#extract_text_objects(page, text_state) ⇒ Object

#flate_decode(data) ⇒ Object

#lzw_decode(data) ⇒ Object

#raw_stream ⇒ Object

#to_cmap ⇒ Object

#append(decoded_stream) ⇒ `Object`

#decode_raw_stream ⇒ `Object`

#decoded_stream ⇒ `Object`

#decoded_stream=(decoded_stream) ⇒ `Object`

#extract_horizontal_rules(dm_src, dmatrix, result) ⇒ `Object`

#extract_nontext_objects(dm_src, dmatrix, stack, result) ⇒ `Object`

#extract_text_objects(page, text_state) ⇒ `Object`

#flate_decode(data) ⇒ `Object`

#lzw_decode(data) ⇒ `Object`

#raw_stream ⇒ `Object`

#to_cmap ⇒ `Object`