Class: Rpdf2txt::Stream
Constant Summary collapse
- BT_PATTERN =
/\bBT\b(?!(\\[()]|[^(\\])*\))/mn
- ET_PATTERN =
/\bET\b(?!(\\[()]|[^(\\])*\))/mn
- FAIL_PTRN =
/\((\\[()]|[^)])*\bET\b\s*$/mn
- @@nontext_scan_pattern =
%r!(?:#{dm_str})|(\b[qQ]\b)|#{xobj}|\bBI\b(.*?)\bID\b(.*?)\b(EI)\b!mn
- @@hr_scan_pattern =
/#{num}#{num}(\b[lm]\b)/mn
Instance Attribute Summary
Attributes inherited from PdfObject
#attributes, #decoder, #oid, #src
Instance Method Summary collapse
- #append(decoded_stream) ⇒ Object
- #decode_raw_stream ⇒ Object
- #decoded_stream ⇒ Object
- #decoded_stream=(decoded_stream) ⇒ Object
- #extract_horizontal_rules(dm_src, dmatrix, result) ⇒ Object
- #extract_nontext_objects(dm_src, dmatrix, stack, result) ⇒ Object
- #extract_text_objects(page, text_state) ⇒ Object
- #flate_decode(data) ⇒ Object
- #lzw_decode(data) ⇒ Object
- #raw_stream ⇒ Object
- #to_cmap ⇒ Object
Methods inherited from PdfObject
#_parse_attributes, #build_tree, #catalogue_object, #extract_attribute_stream, #initialize, #parse_attributes, #revision_id
Constructor Details
This class inherits a constructor from Rpdf2txt::PdfObject
Instance Method Details
#append(decoded_stream) ⇒ Object
702 703 704 |
# File 'lib/rpdf2txt/object.rb', line 702
#
# Appends +decoded_stream+ to the decoded content held by this object,
# starting from an empty buffer when @decoded_stream has not been set yet.
#
# The empty buffer is created with <tt>''.dup</tt> instead of a bare literal
# so the in-place << keeps working when string literals are frozen.
#
# decoded_stream - the String to append.
#
# Returns the accumulated buffer (the value of String#<<).
def append(decoded_stream)
  (@decoded_stream ||= ''.dup) << decoded_stream
end
#decode_raw_stream ⇒ Object
816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 |
# File 'lib/rpdf2txt/object.rb', line 816
#
# Produces the stream content after optional decryption and after applying
# each filter named in @attributes[:filter], in order.
#
# The raw stream is stored in @decrypted_stream; when a decoder is present
# it is replaced by the result of @decoder.decrypt(self). The :filter
# attribute may be absent, a single name or a (nested) list of names.
#
# A filter that fails, or that is not implemented, does not abort decoding:
# a warning is printed and the data is passed on unchanged to the next
# filter.
#
# Returns the resulting String.
def decode_raw_stream
  @decrypted_stream = raw_stream
  unless @decoder.nil?
    @decrypted_stream = @decoder.decrypt(self)
  end
  stream = @decrypted_stream
  [@attributes[:filter]].flatten.compact.each { |filter|
    begin
      stream = case filter
               when "/FlateDecode"
                 flate_decode stream
               when "/LZWDecode"
                 lzw_decode stream
               else
                 raise "Unimplemented filter: #{filter}"
               end
    rescue StandardError => err
      # Best effort: report the problem and keep the undecoded data.
      warn "'#{err.message}' when filtering with #{filter}"
    end
  }
  stream
end
#decoded_stream ⇒ Object
708 709 710 |
# File 'lib/rpdf2txt/object.rb', line 708
#
# Returns the decoded content of this stream. The value is taken from
# @decoded_stream when that is already set (by an earlier call, by #append
# or by #decoded_stream=); otherwise it is computed once with
# #decode_raw_stream and remembered.
def decoded_stream
  return @decoded_stream if @decoded_stream
  @decoded_stream = decode_raw_stream
end
#decoded_stream=(decoded_stream) ⇒ Object
705 706 707 |
# File 'lib/rpdf2txt/object.rb', line 705
#
# Replaces the decoded content of this stream with +decoded_stream+.
# Later calls to #decoded_stream return this value instead of decoding
# the raw stream.
def decoded_stream=(decoded_stream)
  @decoded_stream = decoded_stream
end
#extract_horizontal_rules(dm_src, dmatrix, result) ⇒ Object
711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 |
# File 'lib/rpdf2txt/object.rb', line 711
#
# Scans +dm_src+ with @@hr_scan_pattern and pushes a HorizontalRule onto
# +result+ for every "l" match whose y value equals the y of the previously
# seen point while its x value differs. Every "l" or "m" match updates the
# remembered point (presumably PDF lineto / moveto operators - confirm
# against the pattern definition).
#
# dm_src  - String to scan.
# dmatrix - handed unchanged to HorizontalRule.new.
# result  - collection that receives the rules via #push.
def extract_horizontal_rules(dm_src, dmatrix, result)
  prev_x = 0
  prev_y = 0
  dm_src.scan(@@hr_scan_pattern) do |captures|
    operator = captures.last.to_s[-1]
    next unless operator == ?l || operator == ?m

    x = captures[0].to_f
    y = captures[1].to_f
    # Only an "l" that stays on the same y but moves in x counts as a rule.
    if operator == ?l && x != prev_x && y == prev_y
      rule = HorizontalRule.new(x, y, dmatrix)
      rule.current_page = @page
      rule.text_state = @text_state
      result.push(rule)
    end
    prev_x = x
    prev_y = y
  end
end
#extract_nontext_objects(dm_src, dmatrix, stack, result) ⇒ Object
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 |
# File 'lib/rpdf2txt/object.rb', line 732
#
# Walks every match of @@nontext_scan_pattern in +dm_src+ and, depending on
# the last non-nil capture of the match:
#
# 'q'  - saves the current matrix on +stack+
# 'Q'  - restores the matrix most recently saved on +stack+
# 'Do' - pushes an ImagePlacement for the captured object onto +result+,
#        positioned at the x/y of the last entry in +result+ (0, 0 if none)
# 'EI' - builds an InlineImage from the captured attributes and data and
#        pushes an ImagePlacement for it at 0, 0
# else - treats the first six captures as numbers of a transformation and
#        multiplies the current matrix with it
#
# Returns the matrix in effect after the last match.
def extract_nontext_objects(dm_src, dmatrix, stack, result)
  dm_src.scan(@@nontext_scan_pattern) do |captures|
    tokens = captures.compact
    case tokens.last
    when 'q'
      stack.push(dmatrix)
    when 'Q'
      # NOTE(review): with an empty stack this sets the matrix to nil -
      # confirm that unbalanced 'Q' operators cannot reach this point.
      dmatrix = stack.pop
    when 'Do'
      previous = result.last
      x, y = previous ? [previous.x, previous.y] : [0, 0]
      placement = ImagePlacement.new(tokens[-2], x, y, dmatrix)
      placement.current_page = @page
      placement.text_state = @text_state
      result.push placement
    when 'EI'
      image = InlineImage.new(tokens[0], tokens[1].strip)
      placement = ImagePlacement.new(image, 0, 0, dmatrix)
      placement.current_page = @page
      placement.text_state = @text_state
      result.push placement
    else
      a, b, c, d, e, f = (0..5).map { |i| tokens[i].to_f }
      dmatrix *= Matrix[[a, b, 0],
                        [c, d, 0],
                        [e, f, 1]]
    end
  end
  dmatrix
end
#extract_text_objects(page, text_state) ⇒ Object
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 |
# File 'lib/rpdf2txt/object.rb', line 760
#
# Cuts the decoded stream into text sections (from a BT_PATTERN match up to
# an ET_PATTERN match) and returns everything found, in stream order.
#
# The source between two text sections is handed to
# #extract_nontext_objects and #extract_horizontal_rules, which update the
# device matrix and may push their own objects onto the result. Each text
# section is wrapped in a Text object whose #scan output is appended.
#
# An ET match is skipped as long as FAIL_PTRN matches the section ending
# there; the section is then extended to the next ET match.
#
# A stream without any ET match yields an empty Array. The original code
# raised NoMethodError in that case because it evaluated endpoint+2 on nil.
#
# page       - remembered in @page; its attributes[:rotate] (degrees) seeds
#              the device matrix. May be nil (rotation 0).
# text_state - remembered in @text_state and assigned to every Text.
#
# Returns an Array.
def extract_text_objects(page, text_state)
  @page, @text_state = page, text_state
  stack = []
  result = []
  startpoint = decoded_stream.index(BT_PATTERN)
  endpoint = decoded_stream.index(ET_PATTERN)
  # endpoint is nil when no (further) ET exists; stop instead of crashing.
  while endpoint && FAIL_PTRN.match(decoded_stream[0..(endpoint+2)])
    endpoint = decoded_stream.index(ET_PATTERN, endpoint.next)
  end
  unless(startpoint && endpoint && (startpoint < endpoint))
    startpoint = 0
  end
  rotation = (page && Math::PI * page.attributes[:rotate].to_f / 180) || 0
  dmatrix = Matrix[[Math.cos(rotation),Math.sin(rotation),0],
                   [Math.sin(rotation),-Math.cos(rotation),0],
                   [0,0,1]]
  dm_src = decoded_stream[0...startpoint]
  while(endpoint && startpoint)
    ### pick out the bits in between Text that are relevant to
    ### text positioning (such as the device-transformation-matrix)
    ### NOTE: as far as I understand, the device matrix should
    ###       not be used to position text. However it is used
    ###       by some PDF-Creators and therefore we have to include
    ###       it in our calculations.
    dmatrix = extract_nontext_objects(dm_src, dmatrix, stack, result)
    extract_horizontal_rules(dm_src, dmatrix, result)
    tsrc = decoded_stream[startpoint..(endpoint+2)]
    while FAIL_PTRN.match(tsrc)
      # NOTE(review): when no further ET exists endpoint becomes -1, so the
      # slice below ends at index 1 - confirm that this is intended.
      endpoint = decoded_stream.index(ET_PATTERN, endpoint + 2) || -1
      tsrc = decoded_stream[startpoint..(endpoint+2)]
    end
    text = Text.new(tsrc, @target_encoding, dmatrix)
    text.current_page = page
    text.text_state = text_state
    result.concat text.scan
    startpoint = decoded_stream.index(BT_PATTERN, endpoint)
    if(startpoint)
      dm_src = decoded_stream[endpoint...startpoint]
      endpoint = decoded_stream.index(ET_PATTERN, startpoint)
    end
  end
  result
end
#flate_decode(data) ⇒ Object
838 839 840 |
# File 'lib/rpdf2txt/object.rb', line 838
#
# Decompresses +data+ with zlib's inflate.
#
# data - the deflated String.
#
# Returns the inflated String. Errors raised by Zlib propagate to the
# caller.
def flate_decode(data)
  inflated = Zlib::Inflate.inflate(data)
  inflated
end
#lzw_decode(data) ⇒ Object
841 842 843 844 845 846 847 848 |
# File 'lib/rpdf2txt/object.rb', line 841
#
# Decodes +data+ with LZW.decode. The decoder library is loaded on first
# use.
#
# When @attributes[:length] is present, +data+ is cut to that many leading
# characters/bytes before decoding. The second argument handed to
# LZW.decode is @attributes[:decodeparms][:earlychange] converted with
# to_i, or 1 when that setting is absent.
#
# Returns whatever LZW.decode returns.
def lzw_decode(data)
  require 'rpdf2txt/lzw'
  parms = @attributes[:decodeparms]
  early_change = parms && parms[:earlychange]
  declared_length = @attributes[:length]
  data = data[0, declared_length.to_i] if declared_length
  LZW.decode data, (early_change || 1).to_i
end
#raw_stream ⇒ Object
804 805 806 807 808 809 810 811 812 813 814 815 |
# File 'lib/rpdf2txt/object.rb', line 804
#
# Returns the undecoded payload of this object: everything between the
# first "stream" keyword (followed by one or two CR/LF characters) and the
# last "endstream" keyword in @src. The value is computed once and cached
# in @raw_stream.
#
# When @src holds no stream section an empty String is returned. The
# previous fallback converted an empty Array with to_s, which yields the
# literal "[]" on Ruby 1.9 and later.
def raw_stream
  @raw_stream ||= @src[/stream[\r\n]{1,2}(.*)endstream/mn, 1] || ''
end