Class: TocExtract
- Inherits: Object
- Inheritance hierarchy: Object → TocExtract
- Defined in:
- lib/toc_extract.rb,
lib/toc_extract/preview.rb,
lib/toc_extract/extractor.rb
Class Method Summary collapse
- .extract(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object
- .fill_bounding_boxes(pdf_file, sections, toc_end_page) ⇒ Object
- .hi(language = "english") ⇒ Object
- .preview(pdf_file, section, crop_width, crop_height) ⇒ Object
- .sections_from_toc_lines(lines, template) ⇒ Object
- .toc_lines(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object
Class Method Details
.extract(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object
14 15 16 17 18 19 20 |
# File 'lib/toc_extract/extractor.rb', line 14

# Runs the whole extraction pipeline for one PDF: reads the table-of-contents
# lines from the given page range, parses them into sections, and then asks
# fill_bounding_boxes to attach a bounding box to each section it can locate.
#
# @param pdf_file path of the PDF, passed through to toc_lines and fill_bounding_boxes
# @param template passed through unchanged to toc_lines and sections_from_toc_lines
# @param toc_start_page first page index of the table of contents
# @param toc_end_page last page index of the table of contents
# @return the sections produced by sections_from_toc_lines
def self.extract(pdf_file, template, toc_start_page, toc_end_page)
  toc_entries = TocExtract.toc_lines(pdf_file, template, toc_start_page, toc_end_page)

  TocExtract.sections_from_toc_lines(toc_entries, template).tap do |parsed|
    # Called for its side effect on the section objects; its return value is ignored.
    TocExtract.fill_bounding_boxes(pdf_file, parsed, toc_end_page)
  end
end
.fill_bounding_boxes(pdf_file, sections, toc_end_page) ⇒ Object
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/toc_extract/extractor.rb', line 119

# Locates each section title in the text that follows the table of contents and
# stores the bounding box of the text run the title starts at on the section.
#
# The text of every run on the pages after +toc_end_page+ is concatenated into
# one string; for each run, the character offset at which it starts is mapped
# to a hash with the keys "x", "y", "width", "endx", "endy" and "page".
# A section receives a box only when the first occurrence of its title starts
# exactly at the beginning of a run. Sections whose title is not found, or
# starts in the middle of a run, are left untouched.
#
# @param pdf_file path of the PDF, handed to PDF::Reader.open
# @param sections objects responding to +title+ and +bounding_box=+
# @param toc_end_page index of the last table-of-contents page; pages up to and
#   including this index are skipped
# @return the +sections+ collection that was passed in
def self.fill_bounding_boxes(pdf_file, sections, toc_end_page)
  offset = 0
  bboxes = {}
  content_parts = []

  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num <= toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        content_parts << run.text
        # Keyed by the run's starting offset inside the concatenated content.
        bboxes[offset] = {
          "x" => run.x,
          "y" => run.y,
          "width" => run.width,
          "endx" => run.endx,
          "endy" => run.endy,
          "page" => page_num
        }
        offset += run.text.length
      end
    end
  end

  content = content_parts.join

  sections.each do |section|
    title = section.title
    # An empty title "matches" at offset 0 and would wrongly receive the box of
    # the very first run, so blank titles are skipped.
    next if title.nil? || title.empty?

    # index returns nil when the title is absent; bboxes[nil] is then nil too.
    box = bboxes[content.index(title)]
    section.bounding_box = box if box
  end
end
.hi(language = "english") ⇒ Object
2 3 4 5 |
# File 'lib/toc_extract.rb', line 2

# Builds a Translator for the requested language and returns whatever its
# +hi+ method returns.
#
# @param language language handed to Translator.new (defaults to "english")
# @return the result of Translator#hi
def self.hi(language = "english")
  Translator.new(language).hi
end
.preview(pdf_file, section, crop_width, crop_height) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/toc_extract/preview.rb', line 2

# Renders a PNG preview of the region of the page where a section starts.
#
# The page at index +section.page_number+ is rasterised at a density of 150,
# cropped starting slightly left of and above the section's bounding box, and
# the crop is resized to fit within +crop_width+ x +crop_height+.
#
# @param pdf_file path of the PDF
# @param section object responding to +page_number+ and +bounding_box+; the
#   bounding box is a hash with at least the keys "x" and "endy"
# @param crop_width width of the crop; scaled from page units to pixels for
#   cropping and also used as the maximum output width
# @param crop_height height of the crop, used the same way as +crop_width+
# @return [String] the PNG data, or an empty string when the section has no
#   bounding box
def self.preview(pdf_file, section, crop_width, crop_height)
  # A section only has a bounding box when its title was located in the
  # document. Without one there is nothing to crop, so return the empty
  # default before loading the imaging libraries or opening the PDF.
  bounding_box = section.bounding_box
  return "" if bounding_box.nil?

  # Loaded lazily so the imaging stack is only required when a preview is made.
  require "rmagick"
  require "pdf/reader"
  require "pdf/reader/find_text"

  result = ""
  PDF::Reader.open(pdf_file) do |reader|
    # NOTE(review): the bounding box written by fill_bounding_boxes also carries
    # a "page" entry; confirm it always agrees with section.page_number.
    page_num = section.page_number
    target_page = reader.pages[page_num]
    page_width = target_page.width
    page_height = target_page.height

    images = Magick::Image.read("#{pdf_file}[#{page_num}]") do |info|
      info.density = 150
    end
    img = images.first
    img_width = img.columns
    img_height = img.rows

    # Pixels per page unit along each axis.
    scale_x = img_width.to_f / page_width
    scale_y = img_height.to_f / page_height

    # pdf-reader: origin at bottom-left, Y increases upward.
    # RMagick: origin at top-left, Y increases downward.
    # Start from the box's left edge and top edge ("endy"), with some padding.
    pdf_x = bounding_box["x"] - 30
    pdf_y = bounding_box["endy"] + 10

    img_x = (pdf_x * scale_x).round
    img_y = ((page_height - pdf_y) * scale_y).round # flip the Y coordinate
    img_width_pixels = (crop_width * scale_x).round
    img_height_pixels = (crop_height * scale_y).round

    # Keep the origin non-negative and the size no larger than the image.
    img_x = [img_x, 0].max
    img_y = [img_y, 0].max
    img_width_pixels = [img_width_pixels, img_width].min
    img_height_pixels = [img_height_pixels, img_height].min

    cropped_img = img.crop(img_x, img_y, img_width_pixels, img_height_pixels)
    cropped_img.resize_to_fit!(crop_width, crop_height)

    result = cropped_img.to_blob { |info| info.format = "PNG" }
  end
  result
end
.sections_from_toc_lines(lines, template) ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/toc_extract/extractor.rb', line 80

# Parses table-of-contents lines into Section objects.
#
# The line layout is hard-coded as <NUMBER><TEXT>[...]<NUMBER>; +template+ is
# accepted but not used yet. For every line:
# * the section id is the leading run of digits and dots,
# * the page is the trailing run of digits minus one (TOC pages are 1-based);
#   a line without trailing digits therefore yields -1,
# * the title is the text in between, with a trailing run of dots removed.
#
# @param lines [Array<String>] merged TOC lines, e.g. as returned by toc_lines
# @param template currently ignored
# @return [Array] one Section per input line, built as
#   Section.new(section_id, title, page)
def self.sections_from_toc_lines(lines, template)
  lines.map do |line|
    # Both patterns can match the empty string, so neither lookup returns nil.
    section_id = line[/\A[\d.]*/]
    page_digits = line[/\d*\z/]

    # The two scans are independent: on a line made only of digits and dots
    # they may overlap, in which case the slice below is simply empty.
    title_end = line.length - page_digits.length
    title = line[section_id.length...title_end].sub(/\.+$/, '')

    Section.new(section_id, title, page_digits.to_i - 1)
  end
end
.toc_lines(pdf_file, template, toc_start_page, toc_end_page) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# File 'lib/toc_extract/extractor.rb', line 23

# Reads the table-of-contents pages of a PDF and returns one string per TOC entry.
#
# To detect the lines, we assume that all the elements on a line have the same
# y value. If this assumption changes in the future, we can instead list all
# the deltas between y values and compute breaking points where a noticeable
# jump in delta occurs.
#
# +template+ is a placeholder for future extensions. At the moment, always
# assume that each line starts with a section number, followed by the section
# title and optionally some dots, and ends with a page number. Moreover, we
# assume that pages use Arabic numbering as opposed to Roman numerals.
#
# @param pdf_file path of the PDF, handed to PDF::Reader.open
# @param template currently ignored
# @param toc_start_page index of the first TOC page (inclusive)
# @param toc_end_page index of the last TOC page (inclusive)
# @return [Array<String>] the merged TOC lines, each starting with a digit
def self.toc_lines(pdf_file, template, toc_start_page, toc_end_page)
  lines = []
  max_delta = 0
  current = +""
  # nil means "no run seen yet"; a real y of 0 must not be mistaken for that.
  last_y = nil

  PDF::Reader.open(pdf_file) do |reader|
    reader.pages.each_with_index do |page, page_num|
      next if page_num < toc_start_page
      break if page_num > toc_end_page

      page.extend(PDF::Reader::FindText)
      page.runs(merge: false).each do |run|
        y = run.y
        last_y = y if last_y.nil?

        if (last_y - y).abs > max_delta
          # The y value changed: the previous visual line is complete.
          lines << current
          current = run.text.dup
          last_y = y
        else
          current << run.text
        end
      end
    end
  end

  # The line being built when the runs end has not been stored yet; without
  # this the final TOC entry would be lost.
  lines << current unless current.empty?

  # Merge lines based on the template. Currently hard-coded as
  # <NUMBER><TEXT>[...]<NUMBER>. Also remove the lines that contain only a page
  # number. This should also be part of the template, assuming Arabic numbering.
  real_lines = []
  lines.each do |text|
    next if text.empty?

    if text[0].match?(/[\d]/)
      # Digits only: a bare page number, not a TOC entry.
      next unless text.match?(/\D/)

      real_lines << text
    elsif !real_lines.empty?
      # Continuation of the previous entry. We could be unlucky and get a line
      # that happens to have a number in its text portion and breaks at that
      # exact point; ignoring that for now.
      real_lines[-1] += text
    end
    # Otherwise nothing has been collected yet: this is the TOC header.
  end
  real_lines
end