Class: AnyStyle::Document

Inherits:
Wapiti::Sequence
  • Object
show all
Extended by:
PDFUtils
Includes:
StringUtils
Defined in:
lib/anystyle/document.rb

Constant Summary collapse

# Case-insensitive pattern for heading text that announces a reference
# section; #references tests the title lines of each section against it.
REFSECT =
/references|referenzen|cited|bibliogra|secondary sources|literatur/i

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from PDFUtils

pdf_info, pdf_page_size, pdf_to_text

Methods included from StringUtils

canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate

Instance Attribute Details

#info ⇒ Object

Returns the value of attribute info.



48
49
50
# File 'lib/anystyle/document.rb', line 48

# Reader for the +info+ attribute.
#
# Document.open stores the result of pdf_info here when the :parse_info
# option is set; otherwise it assigns nil.
#
# @return [Object, nil] the current value of @info
def info
  @info
end

#meta ⇒ Object

Returns the value of attribute meta.



48
49
50
# File 'lib/anystyle/document.rb', line 48

# Reader for the +meta+ attribute.
#
# Document.open stores the result of pdf_meta here when the :parse_meta
# option is set; otherwise it assigns nil.
#
# @return [Object, nil] the current value of @meta
def meta
  @meta
end

#pages ⇒ Object

Returns the value of attribute pages.



48
49
50
# File 'lib/anystyle/document.rb', line 48

# Reader for the +pages+ attribute.
#
# #each walks this collection and calls #lines on every element, so each
# page is expected to respond to #lines.
#
# @return [Object, nil] the current value of @pages
def pages
  @pages
end

#path ⇒ Object

Returns the value of attribute path.



48
49
50
# File 'lib/anystyle/document.rb', line 48

# Reader for the +path+ attribute.
#
# Document.open assigns the absolute path of the file it read.
#
# @return [String, nil] the current value of @path
def path
  @path
end

#tokens ⇒ Object Also known as: lines

Returns the value of attribute tokens.



48
49
50
# File 'lib/anystyle/document.rb', line 48

# Reader for the +tokens+ attribute (also available as +lines+).
#
# @return [Object, nil] the current value of @tokens
def tokens
  @tokens
end

Class Method Details

.open(path, format: File.extname(path), tagged: false, **opts) ⇒ Object

Raises:

  • (ArgumentError)


20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/anystyle/document.rb', line 20

# Opens the file at +path+ and builds a document from its contents.
#
# The format defaults to the file extension and is compared
# case-insensitively:
#
# * '.pdf' - text is extracted with pdf_to_text; pdf_meta / pdf_info are
#   called as well when opts[:parse_meta] / opts[:parse_info] are set.
# * '.ttx' - read as UTF-8 text and always parsed as tagged input.
# * '.txt' - read as UTF-8 text.
#
# @param path [String] path to the input file
# @param format [String] file extension including the leading dot
# @param tagged [Boolean] whether the text carries label prefixes
# @param opts [Hash] options passed on to the PDF helpers
# @return [Object] the parsed document with path, meta and info assigned
# @raise [ArgumentError] if the file does not exist, or if the format is
#   none of '.pdf', '.ttx' or '.txt'
def open(path, format: File.extname(path), tagged: false, **opts)
  raise ArgumentError,
    "document not found: '#{path}'" unless File.exist?(path)

  path = File.absolute_path(path)

  case format.to_s.downcase
  when '.pdf'
    meta = pdf_meta path, **opts if opts[:parse_meta]
    info = pdf_info path, **opts if opts[:parse_info]
    input = pdf_to_text path, **opts
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch +input+ stays nil and parsing fails later with
    # an unrelated NoMethodError.
    raise ArgumentError, "unsupported document format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end

.parse(string, delimiter: /\r?\n/, tagged: false) ⇒ Object



9
10
11
12
13
14
15
16
17
18
# File 'lib/anystyle/document.rb', line 9

# Splits +string+ into lines and wraps each one in a Wapiti::Token.
#
# In tagged mode every line is expected to look like "label | value".
# A line whose label part is empty inherits the label of the closest
# preceding labelled line. Blank lines (which split into nothing) keep
# the current label and get an empty value instead of raising.
#
# @param string [String] the raw text
# @param delimiter [Regexp, String] line separator used for splitting
# @param tagged [Boolean] whether lines carry a "label |" prefix
# @return [Object] a new instance built from the resulting tokens
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''
  new(string.split(delimiter).map { |line|
    if tagged
      label, line = line.split(/\s*\|(?: |$)/, 2)
      # An empty line yields no parts at all, so label may be nil.
      current_label = label unless label.nil? || label.empty?
      # A line without a value part leaves the value nil; use '' instead.
      line ||= ''
    end
    Wapiti::Token.new line, label: current_label.to_s
  })
end

Instance Method Details

#each ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
# File 'lib/anystyle/document.rb', line 63

# Walks every line of every page in order.
#
# @yield [line, ln, page, pn] the line, its index within the page, the
#   page, and the page's index
# @return [self, Enumerator] self when a block is given, otherwise an
#   enumerator over the same values
def each
  return to_enum unless block_given?

  pages.each.with_index do |current_page, page_index|
    current_page.lines.each.with_index do |current_line, line_index|
      yield current_line, line_index, current_page, page_index
    end
  end

  self
end

#each_section(skip: ['meta']) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/anystyle/document.rb', line 76

# Groups the document's lines into sections and yields each one as a
# [head, body] pair.
#
# Consecutive 'title' lines form the head. 'ref' and 'text' lines go to
# the body and mark the section as having content; the next 'title' line
# after that closes the section and starts a new one. Lines with any
# other label go to the body unless the label is listed in +skip+.
#
# @param skip [Array<String>] labels to leave out of section bodies
# @yield [section] a two-element array of head lines and body lines
# @return [self, Enumerator] self when a block is given, otherwise an
#   enumerator that honours the same +skip+ list
def each_section(skip: ['meta'])
  # Forward +skip+ so the enumerator form filters like the block form.
  return to_enum(:each_section, skip: skip) unless block_given?

  head = []
  body = []
  seen_content = false

  lines.each do |ln|
    case ln.label
    when 'title'
      if seen_content
        yield [head, body]
        head, body, seen_content = [ln], [], false
      else
        head << ln
      end
    when 'ref', 'text'
      body << ln
      seen_content = true
    else
      body << ln unless skip.include?(ln.label)
    end
  end

  # NOTE(review): a trailing section without any title line is not
  # yielded, even if its body is non-empty - confirm this is intended.
  yield [head, body] unless head.empty?

  self
end

#include_references?(rc, tc) ⇒ Boolean

Returns:

  • (Boolean)


167
168
169
# File 'lib/anystyle/document.rb', line 167

# Decides whether a section without a reference heading should still
# contribute references.
#
# True when there are more than 10 'ref' lines, or when the section has
# more than 20 ref/text lines in total and the ref-to-text ratio
# exceeds 0.2.
#
# @param rc [Integer] number of 'ref' lines
# @param tc [Integer] number of 'text' lines
# @return [Boolean]
def include_references?(rc, tc)
  return true if rc > 10
  return false unless (rc + tc) > 20

  ratio = rc.to_f / tc
  ratio > 0.2
end

#inspect ⇒ Object



192
193
194
# File 'lib/anystyle/document.rb', line 192

# Short description of the document that shows only its size.
#
# @return [String]
def inspect
  line_total = size
  "#<AnyStyle::Document lines={#{line_total}}>"
end

#label(other) ⇒ Object



107
108
109
110
111
112
113
114
115
116
# File 'lib/anystyle/document.rb', line 107

# Builds a copy of this document whose tokens keep their own values but
# take label, observations and score from the token at the same index
# in +other+.
#
# @param other [#[]] indexable collection of tokens
# @return [Object] the relabelled duplicate; the receiver's tokens are
#   not reassigned
def label(other)
  dup.tap { |copy|
    copy.tokens = lines.map.with_index { |line, idx|
      source = other[idx]
      Wapiti::Token.new(
        line.value,
        label: source.label.to_s,
        observations: source.observations.dup,
        score: source.score
      )
    }
  }
end

#line_counts ⇒ Object



51
52
53
# File 'lib/anystyle/document.rb', line 51

# Lazily created counter table; missing keys read as 0.
#
# @return [Hash] the same hash on every call
def line_counts
  return @line_counts if @line_counts

  @line_counts = Hash.new(0)
end

#nnum_counts ⇒ Object



55
56
57
# File 'lib/anystyle/document.rb', line 55

# Lazily created counter table; missing keys read as 0.
#
# @return [Hash] the same hash on every call
def nnum_counts
  return @nnum_counts if @nnum_counts

  @nnum_counts = Hash.new(0)
end

#references(normalize_blocks: false, **opts) ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/anystyle/document.rb', line 145

# Collects the references found in the document.
#
# Without +normalize_blocks+ all lines are handed to Refs.parse at once.
# With it, the document is processed section by section: sections with
# no 'ref' lines are ignored, and a section is used only if one of its
# title lines matches REFSECT or include_references? accepts its
# ref/text counts. Each accepted body is passed to Refs.normalize!
# (window size 6 for reference sections, 2 otherwise) before parsing.
#
# @param normalize_blocks [Boolean] enable per-section processing
# @param opts [Hash] accepted for call compatibility; not used here
# @return [Array] the parsed references
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), collected|
    ref_count = body.count { |tk| tk.label == 'ref' }
    next if ref_count == 0

    text_count = body.count { |tk| tk.label == 'text' }
    titled_as_refs = head.any? { |tk| tk.value =~ REFSECT }

    # Skip sections with few ref lines!
    next unless titled_as_refs || include_references?(ref_count, text_count)

    window = titled_as_refs ? 6 : 2
    Refs.normalize! body, max_win_size: window
    collected.concat Refs.parse(body).to_a
  end
end

#sections(delimiter: "\n", spacer: ' ', **opts) ⇒ Object



171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/anystyle/document.rb', line 171

# Renders every section as a hash of plain strings.
#
# Title lines are passed through display_chars, stripped of leading
# whitespace, Unicode-normalized and joined with +spacer+. Body lines
# get the same treatment without the stripping and are joined with
# +delimiter+.
#
# @param delimiter [String] separator between body lines
# @param spacer [String] separator between title lines
# @param opts [Hash] accepted for call compatibility; not used here
# @return [Array<Hash>] one { title:, text: } hash per section
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |(head, body)|
    heading_parts = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    body_parts = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading_parts.join(spacer), text: body_parts.join(delimiter) }
  end
end

#title(delimiter: " ", **opts) ⇒ Object



184
185
186
187
188
189
190
# File 'lib/anystyle/document.rb', line 184

# Joins the values of the first uninterrupted run of 'title' lines.
#
# @param delimiter [String] separator placed between the line values
# @param opts [Hash] accepted for call compatibility; not used here
# @return [String] the joined title, or an empty string when no line is
#   labelled 'title'
def title(delimiter: " ", **opts)
  from_first_title = lines.drop_while { |ln| ln.label != 'title' }
  title_run = from_first_title.take_while { |ln| ln.label == 'title' }

  title_run.map { |ln| ln.value }.join(delimiter)
end

#to_a(encode: true, **opts) ⇒ Object



131
132
133
# File 'lib/anystyle/document.rb', line 131

# Delegates to the superclass #to_a, with +encode+ defaulting to true
# in this override.
#
# @param encode [Boolean] forwarded to super as the :encode option
# @param opts [Hash] any further options, forwarded unchanged
# @return [Object] whatever the superclass implementation returns
def to_a(encode: true, **opts)
  super(encode: encode, **opts)
end

#to_h(**opts) ⇒ Object



135
136
137
138
139
140
141
142
143
# File 'lib/anystyle/document.rb', line 135

# Summarizes the document as a hash.
#
# The same +opts+ are handed to #sections, #title and #references, so
# an option such as :delimiter applies to every one that accepts it.
#
# @param opts [Hash] options forwarded to the three builders
# @return [Hash] keys :info, :meta, :sections, :title and :references,
#   in that order
def to_h(**opts)
  summary = { info: info, meta: meta }
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end

#to_s(delimiter: "\n", encode: false, tagged: false, **opts) ⇒ Object



118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/anystyle/document.rb', line 118

# Serializes the document as text.
#
# In tagged mode each line is rendered as a 14-character label column,
# a "| " separator and the line value. The label is printed only when
# it differs from the previous line's label, and it is cut off after
# 14 characters. Otherwise the call is delegated to the superclass with
# +expanded+ forced to false.
#
# @param delimiter [String] separator between lines
# @param encode [Boolean] forwarded to super in untagged mode
# @param tagged [Boolean] emit the "label | value" format
# @param opts [Hash] further options forwarded to super
# @return [String]
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  unless tagged
    return super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end

  last_label = nil
  rows = lines.map do |ln|
    shown = ln.label == last_label ? '' : ln.label
    last_label = ln.label
    # Padding with 14 spaces and cutting at 14 gives a fixed-width column.
    '%.14s| %s' % ["#{shown}              ", ln.value]
  end

  rows.join(delimiter)
end