Class: AnyStyle::Document
- Inherits:
-
Wapiti::Sequence
- Object
- Wapiti::Sequence
- AnyStyle::Document
show all
- Extended by:
- PDFUtils
- Includes:
- StringUtils
- Defined in:
- lib/anystyle/document.rb
Constant Summary
collapse
- REFSECT =
/references|referenzen|cited|bibliogra|secondary sources|literatur/i
Instance Attribute Summary collapse
Class Method Summary
collapse
Instance Method Summary
collapse
Methods included from PDFUtils
pdf_info, pdf_page_size, pdf_to_text
Methods included from StringUtils
canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate
Instance Attribute Details
#info ⇒ Object
Returns the value of attribute info.
50
51
52
|
# File 'lib/anystyle/document.rb', line 50
# Reader for the +info+ attribute.
#
# @return [Object] the current value of @info (nil until assigned)
def info; @info; end
|
#meta ⇒ Object
Returns the value of attribute meta.
50
51
52
|
# File 'lib/anystyle/document.rb', line 50
# Reader for the +meta+ attribute.
#
# @return [Object] the current value of @meta (nil until assigned)
def meta; @meta; end
|
#pages ⇒ Object
Returns the value of attribute pages.
50
51
52
|
# File 'lib/anystyle/document.rb', line 50
# Reader for the +pages+ attribute.
#
# @return [Object] the current value of @pages (nil until assigned)
def pages; @pages; end
|
#path ⇒ Object
Returns the value of attribute path.
50
51
52
|
# File 'lib/anystyle/document.rb', line 50
# Reader for the +path+ attribute.
#
# @return [Object] the current value of @path (nil until assigned)
def path; @path; end
|
#tokens ⇒ Object
Also known as:
lines
Returns the value of attribute tokens.
50
51
52
|
# File 'lib/anystyle/document.rb', line 50
# Reader for the +tokens+ attribute.
#
# @return [Object] the current value of @tokens (nil until assigned)
def tokens; @tokens; end
|
Class Method Details
.open(path, format: File.extname(path), tagged: false, **opts) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
# File 'lib/anystyle/document.rb', line 20
# Reads the file at +path+ and turns it into a document via +parse+.
#
# @param path [String] location of the file to load
# @param format [String] file extension (with leading dot) selecting the
#   reader; compared case-insensitively. '.pdf', '.ttx' and '.txt' are
#   handled; '.ttx' always forces tagged parsing.
# @param tagged [Boolean] forwarded to +parse+ (overridden for '.ttx')
# @param opts [Hash] forwarded to the PDF helpers; +:parse_meta+ and
#   +:parse_info+ switch on the respective PDF look-ups
# @return [Object] the parsed document with path, meta and info assigned
# @raise [ArgumentError] if the path is tainted, the file does not exist,
#   or the format is not one of the supported extensions
def open(path, format: File.extname(path), tagged: false, **opts)
  # Object#tainted? was removed in Ruby 3.2; only consult it where it exists.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError, "cannot open tainted path: '#{path}'"
  end

  unless File.exist?(path)
    raise ArgumentError, "document not found: '#{path}'"
  end

  path = File.absolute_path(path)
  meta = info = nil

  case format.downcase
  when '.pdf'
    meta = pdf_meta path, **opts if opts[:parse_meta]
    info = pdf_info path, **opts if opts[:parse_info]
    input = pdf_to_text path, **opts
  when '.ttx'
    tagged = true
    input = File.read(path, encoding: 'utf-8')
  when '.txt'
    input = File.read(path, encoding: 'utf-8')
  else
    # Without this branch +input+ stays nil and parse fails with an
    # unrelated NoMethodError.
    raise ArgumentError, "unsupported format: '#{format}'"
  end

  doc = parse input, tagged: tagged
  doc.path = path
  doc.meta = meta
  doc.info = info
  doc
end
|
.parse(string, delimiter: /\r?\n/, tagged: false) ⇒ Object
9
10
11
12
13
14
15
16
17
18
|
# File 'lib/anystyle/document.rb', line 9
# Builds a new document from raw text, creating one token per line.
#
# @param string [String] the text to split into lines
# @param delimiter [String, Regexp] separator handed to String#split
# @param tagged [Boolean] when true each line is split once on the
#   pattern /\s*\| / into a label column and the remaining text; an empty
#   label column re-uses the most recent non-empty label
# @return [Object] the result of calling +new+ with the list of tokens
def parse(string, delimiter: /\r?\n/, tagged: false)
  current_label = ''

  tokens = string.split(delimiter).map do |raw|
    value = raw

    if tagged
      label, value = raw.split(/\s*\| /, 2)

      if label.nil?
        # A blank line splits into an empty array; keep it as an empty
        # line under the current label instead of calling nil.empty?.
        value = raw
      elsif !label.empty?
        current_label = label
      end
    end

    Wapiti::Token.new value, label: current_label.to_s
  end

  new(tokens)
end
|
Instance Method Details
#each ⇒ Object
65
66
67
68
69
70
71
72
73
74
75
76
|
# File 'lib/anystyle/document.rb', line 65
# Walks every line of every page in order.
#
# Yields four values per line: the line, its index within the page, the
# page, and the page's index within +pages+.
#
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each
  return to_enum unless block_given?

  pages.each.with_index do |page, page_no|
    page.lines.each.with_index do |line, line_no|
      yield line, line_no, page, page_no
    end
  end

  self
end
|
#each_section(skip: ['meta']) ⇒ Object
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
# File 'lib/anystyle/document.rb', line 78
# Groups +lines+ into sections and yields each one as a [head, body] pair.
#
# Lines labelled 'title' go into the head. Lines labelled 'ref' or 'text'
# go into the body and mark the section as having content; a 'title' line
# seen after content closes the current section and starts the next one.
# Lines with any other label go into the body unless the label is listed
# in +skip+. The final section is yielded only if its head is non-empty.
#
# @param skip [Array<String>] labels whose lines are left out of the body
# @return [self] when a block is given
# @return [Enumerator] when called without a block
def each_section(skip: ['meta'])
  # Forward +skip+ so the enumerator honours the caller's choice instead
  # of silently falling back to the default.
  return to_enum(:each_section, skip: skip) unless block_given?

  head = []
  body = []
  seen_content = false

  lines.each do |ln|
    case ln.label
    when 'title'
      if seen_content
        yield [head, body]
        head, body, seen_content = [ln], [], false
      else
        head << ln
      end
    when 'ref', 'text'
      body << ln
      seen_content = true
    else
      body << ln unless skip.include?(ln.label)
    end
  end

  yield [head, body] unless head.empty?

  self
end
|
#include_references?(rc, tc) ⇒ Boolean
169
170
171
|
# File 'lib/anystyle/document.rb', line 169
# Decides from line counts whether a section's references should be kept.
#
# True when there are more than 10 'ref' lines, or when ref and text
# lines together exceed 20 and the ref-to-text ratio is above 0.2.
#
# @param rc [Numeric] number of lines labelled 'ref'
# @param tc [Numeric] number of lines labelled 'text'
# @return [Boolean]
def include_references?(rc, tc)
  return true if rc > 10

  combined = rc + tc
  ratio = rc.to_f / tc
  combined > 20 && ratio > 0.2
end
|
#inspect ⇒ Object
194
195
196
|
# File 'lib/anystyle/document.rb', line 194
# Short debugging representation that reports the value of +size+.
#
# @return [String]
def inspect
  format('#<AnyStyle::Document lines={%s}>', size)
end
|
#label(other) ⇒ Object
109
110
111
112
113
114
115
116
117
118
|
# File 'lib/anystyle/document.rb', line 109
# Returns a copy of this document whose tokens keep this document's line
# values but take label, observations and score from the tokens of
# +other+ at the same index.
#
# @param other [#[]] indexable collection of tokens responding to
#   +label+, +observations+ and +score+
# @return [Object] a +dup+ of the receiver with freshly built tokens
def label(other)
  copy = dup

  copy.tokens = lines.each_with_index.map do |line, idx|
    source = other[idx]

    Wapiti::Token.new line.value,
      label: source.label.to_s,
      observations: source.observations.dup, # copy so the result does not share the array
      score: source.score
  end

  copy
end
|
#line_counts ⇒ Object
53
54
55
|
# File 'lib/anystyle/document.rb', line 53
# Lazily created counter hash; missing keys read as 0.
#
# @return [Hash] the same hash on every call
def line_counts
  @line_counts = Hash.new(0) unless @line_counts
  @line_counts
end
|
#nnum_counts ⇒ Object
57
58
59
|
# File 'lib/anystyle/document.rb', line 57
# Lazily created counter hash; missing keys read as 0.
#
# @return [Hash] the same hash on every call
def nnum_counts
  @nnum_counts = Hash.new(0) unless @nnum_counts
  @nnum_counts
end
|
#references(normalize_blocks: false, **opts) ⇒ Object
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
|
# File 'lib/anystyle/document.rb', line 147
# Extracts references from the document.
#
# Without +normalize_blocks+ all lines are handed to Refs.parse at once.
# With it, the document is processed section by section: sections without
# any 'ref' line are ignored; the rest are kept if a head line matches
# REFSECT or include_references? accepts the ref/text counts. Kept bodies
# are normalized in place (window size 6 for REFSECT matches, otherwise 2)
# before being parsed.
#
# @param normalize_blocks [Boolean] enable the per-section processing
# @param opts [Hash] accepted but not used here
# @return [Array] the parsed references
def references(normalize_blocks: false, **opts)
  return Refs.parse(lines).to_a unless normalize_blocks

  each_section.each_with_object([]) do |(head, body), refs|
    rc = body.count { |tk| tk.label == 'ref' }
    next if rc == 0

    tc = body.count { |tk| tk.label == 'text' }
    is_ref_sect = head.any? { |tk| tk.value =~ REFSECT }
    next unless is_ref_sect || include_references?(rc, tc)

    Refs.normalize! body, max_win_size: is_ref_sect ? 6 : 2
    refs.concat Refs.parse(body).to_a
  end
end
|
#sections(delimiter: "\n", spacer: ' ', **opts) ⇒ Object
173
174
175
176
177
178
179
180
181
182
183
184
|
# File 'lib/anystyle/document.rb', line 173
# Renders every section from each_section as a hash of plain strings.
#
# Head lines are passed through display_chars, stripped of leading
# whitespace, Unicode-normalized and joined with +spacer+; body lines are
# passed through display_chars, Unicode-normalized and joined with
# +delimiter+.
#
# @param delimiter [String] separator between body lines
# @param spacer [String] separator between title lines
# @param opts [Hash] accepted but not used here
# @return [Array<Hash>] one { title:, text: } hash per section
def sections(delimiter: "\n", spacer: ' ', **opts)
  each_section.map do |(head, body)|
    heading = head.map { |tk| display_chars(tk.value).lstrip.unicode_normalize }
    content = body.map { |tk| display_chars(tk.value).unicode_normalize }

    { title: heading.join(spacer), text: content.join(delimiter) }
  end
end
|
#title(delimiter: " ", **opts) ⇒ Object
186
187
188
189
190
191
192
|
# File 'lib/anystyle/document.rb', line 186
# Joins the values of the first consecutive run of lines labelled 'title'.
#
# @param delimiter [String] separator placed between the title lines
# @param opts [Hash] accepted but not used here
# @return [String] the joined title, or an empty string if no line is
#   labelled 'title'
def title(delimiter: " ", **opts)
  parts = []

  lines.each do |ln|
    if ln.label == 'title'
      parts << ln.value
    elsif !parts.empty?
      # The first title run has ended; later title lines are ignored.
      break
    end
  end

  parts.join(delimiter)
end
|
#to_a(encode: true, **opts) ⇒ Object
133
134
135
|
# File 'lib/anystyle/document.rb', line 133
# Delegates to the inherited to_a, making +encode+ default to true here.
#
# @param encode [Boolean] forwarded to the parent implementation
# @param opts [Hash] any further keywords, forwarded unchanged
# @return [Object] whatever the parent's to_a returns
def to_a(encode: true, **opts)
super(encode: encode, **opts)
end
|
#to_h(**opts) ⇒ Object
137
138
139
140
141
142
143
144
145
|
# File 'lib/anystyle/document.rb', line 137
# Collects the document's info, meta, sections, title and references into
# one hash.
#
# @param opts [Hash] forwarded unchanged to sections, title and references
# @return [Hash] with the keys :info, :meta, :sections, :title and
#   :references, in that order
def to_h(**opts)
  summary = { info: info, meta: meta }
  summary[:sections] = sections(**opts)
  summary[:title] = title(**opts)
  summary[:references] = references(**opts)
  summary
end
|
#to_s(delimiter: "\n", encode: false, tagged: false, **opts) ⇒ Object
120
121
122
123
124
125
126
127
128
129
130
131
|
# File 'lib/anystyle/document.rb', line 120
# Serializes the document to a string.
#
# With +tagged+ each line is written as a label column, "| ", and the
# line's value. The label column is left-justified and padded or
# truncated to exactly 14 characters so the separators line up; a label
# is printed only when it differs from the previous line's label.
# Otherwise the call is delegated to the inherited to_s with
# +expanded: false+.
#
# @param delimiter [String] separator placed between lines
# @param encode [Boolean] forwarded to the parent implementation
# @param tagged [Boolean] selects the tagged format described above
# @param opts [Hash] further keywords forwarded to the parent
# @return [String]
def to_s(delimiter: "\n", encode: false, tagged: false, **opts)
  if tagged
    prev_label = nil

    lines.map { |ln|
      label = (ln.label == prev_label) ? '' : ln.label
      prev_label = ln.label
      # '-' pads on the right, '.14' truncates: the column is always 14 wide.
      format('%-14.14s| %s', label, ln.value)
    }.join(delimiter)
  else
    super(delimiter: delimiter, encode: encode, tagged: tagged, expanded: false, **opts)
  end
end
|