Module: Segment
Defined Under Namespace
Modules: RangeIndex
Class Method Summary
collapse
Instance Method Summary
collapse
#collisions, #includes?, #make_relative, #overlaps, #overlaps?, #pull, #push, #range_in
Class Method Details
.align(text, parts) ⇒ Object
163
164
165
166
167
168
169
170
171
172
173
|
# File 'lib/rbbt/segment.rb', line 163
# Locates each of +parts+ inside +text+, scanning left to right, and annotates
# every part that is found by calling Segment.setup with its absolute offset
# in the original text (and the text's docid, when +text+ responds to +docid+).
#
# Parts that cannot be found in the not-yet-consumed text are skipped and are
# not passed to Segment.setup.
#
# text  - the text to search in; must respond to +index+ and +[]+.
# parts - collection of parts expected to appear in +text+, in order.
#
# Returns the value of +parts.each+.
def self.align(text, parts)
  pre_offset = 0
  docid = text.respond_to?(:docid) ? text.docid : nil

  parts.each do |part|
    offset = text.index part
    next if offset.nil?

    Segment.setup(part, pre_offset + offset, docid)

    # The scan resumes at the LAST character of the part just matched (hence
    # the "- 1"), so the next part may share that one character.
    # NOTE(review): confirm whether this one-character overlap is intended.
    #
    # An empty part matches at index 0 and would yield an advance of -1, which
    # used to push pre_offset negative and truncate +text+ to its final
    # character; clamp at zero so an empty part consumes nothing.
    advance = [offset + part.segment_length - 1, 0].max
    pre_offset += advance
    text = text[advance..-1]
  end
end
|
.ascii(text, replace = nil, &block) ⇒ Object
13
14
15
16
17
|
# File 'lib/rbbt/segment/encoding.rb', line 13
# Hands +text+ to Transformed.with_transform together with the segments
# reported by bad_chars(text) and a replacement value, forwarding any block.
#
# text    - the text to process.
# replace - replacement passed to the transform; "?" is used when nil.
# block   - optional block forwarded to Transformed.with_transform.
#
# Returns whatever Transformed.with_transform returns.
def self.ascii(text, replace = nil, &block)
  substitute = replace.nil? ? "?" : replace
  Transformed.with_transform(text, bad_chars(text), substitute, &block)
end
|
.bad_chars(text) ⇒ Object
3
4
5
6
7
8
9
10
11
|
# File 'lib/rbbt/segment/encoding.rb', line 3
# Collects every character of +text+ that is not ASCII-only, passing each one
# to Segment.setup with its character index as the :offset option.
#
# text - the text to inspect; must respond to +chars+.
#
# Returns an Array with the results of the Segment.setup calls, in text order.
def self.bad_chars(text)
  non_ascii = text.chars.each_with_index.reject { |char, _position| char.ascii_only? }
  non_ascii.map { |char, position| Segment.setup(char, :offset => position) }
end
|
.clean_sort(segments) ⇒ Object
110
111
112
113
114
115
116
117
118
|
# File 'lib/rbbt/segment.rb', line 110
# Sorts +segments+ with sort, discards entries that are nil or have no offset,
# and then removes every segment reported by overlaps.
#
# Overlapping segments are removed by object identity. Array#delete compares
# with ==, which would also remove any other segment that merely has the same
# content as an overlapping one.
#
# segments - collection of segments (objects responding to +offset+).
#
# Returns a new Array with the surviving segments, in sorted order.
def self.clean_sort(segments)
  sorted = sort(segments).reject { |s| s.nil? || s.offset.nil? }

  overlapping = {}.compare_by_identity
  overlaps(sorted).each { |s| overlapping[s] = true }

  sorted.reject { |s| overlapping.key?(s) }
end
|
.index(*args) ⇒ Object
190
191
192
|
# File 'lib/rbbt/segment.rb', line 190
# Convenience entry point that forwards all positional arguments unchanged to
# Segment::RangeIndex.index and returns its result.
def self.index(*args)
  range_index = Segment::RangeIndex
  range_index.index(*args)
end
|
.overlaps(sorted_segments) ⇒ Object
98
99
100
101
102
103
104
105
106
107
108
|
# File 'lib/rbbt/segment.rb', line 98
# Walks +sorted_segments+ from last to first and flags every segment whose
# range end is strictly greater than the range begin of the segment that
# follows it in the list.
#
# NOTE(review): the comparison is strict, so a segment whose range end equals
# the next segment's range begin is not flagged - confirm this is intended.
#
# sorted_segments - list of objects responding to +range+; must respond to
#                   +reverse+.
#
# Returns an Array of the flagged segments, in reverse list order.
def self.overlaps(sorted_segments)
  following_begin = nil

  sorted_segments.reverse.each_with_object([]) do |segment, flagged|
    span = segment.range
    flagged << segment if !following_begin.nil? && span.end > following_begin
    following_begin = span.begin
  end
end
|
.relocate(segment, original, target, pad = 20) ⇒ Object
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
# File 'lib/rbbt/segment.rb', line 175
# Re-anchors +segment+ in +target+ when the text found in +target+ at the
# segment's current range differs from the segment itself.
#
# A window of +original+ around the segment (up to +pad+ characters on each
# side) is searched for in +target+, with every whitespace character in both
# replaced by a single space, and the segment's offset is updated from the
# position of the match.
#
# segment  - object responding to +offset+, +offset=+, +end+ and +range+.
# original - text the segment's current offset refers to.
# target   - text the segment should be located in.
# pad      - maximum number of context characters taken on each side.
#
# Returns nil when the segment already matches +target+ at its range,
# otherwise the newly assigned offset.
# Raises RuntimeError when the context window cannot be found in +target+.
def self.relocate(segment, original, target, pad = 20)
  return if segment == target[segment.range]

  lead = [pad, segment.offset].min
  trail = [pad, original.length - segment.end].min

  window = original[(segment.offset - lead)..(segment.end + trail)]
  needle = window.gsub(/\s/, ' ')
  haystack = target.gsub(/\s/, ' ')

  position = haystack.index(needle)
  raise "Context not found in original text" if position.nil?

  segment.offset = position + lead
end
|
.sort(segments, inline = true) ⇒ Object
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
# File 'lib/rbbt/segment.rb', line 77
# Orders +segments+.
#
# With +inline+ true (the default) a comparison sort is used:
# * entries that are nil or have a nil offset compare equal to each other and
#   sort before everything else;
# * two segments whose ranges do not contain each other's offset are ordered
#   by offset;
# * otherwise the segments are ordered by segment_length.
#
# With +inline+ false the segments are ordered by offset (nil offsets count
# as 0) and the result is reversed, i.e. highest offset first.
#
# segments - collection of segments (objects responding to +offset+, +range+
#            and +segment_length+); nil entries are tolerated when +inline+
#            is true.
# inline   - selects the ordering described above.
#
# Returns a new sorted Array.
def self.sort(segments, inline = true)
  if inline
    segments.sort do |a, b|
      # Evaluate "missing" per operand so that a nil entry is never asked for
      # its offset, whatever the other operand is.
      a_missing = a.nil? || a.offset.nil?
      b_missing = b.nil? || b.offset.nil?

      if a_missing && b_missing
        0
      elsif a_missing
        -1
      elsif b_missing
        1
      elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
        a.offset.to_i <=> b.offset.to_i
      else
        a.segment_length <=> b.segment_length
      end
    end
  else
    segments.sort_by { |segment| segment.offset.to_i }.reverse
  end
end
|
.split(text, segments, skip_segments = false) ⇒ Object
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
# File 'lib/rbbt/segment.rb', line 120
# Cuts +text+ into consecutive chunks along the boundaries of +segments+.
#
# The segments are first passed through clean_sort. For each one, the text
# between the previous cut and the segment start becomes a chunk, then (unless
# +skip_segments+ is true) the text covered by the segment becomes a chunk.
# Whatever text is left after the last segment becomes a final chunk. Every
# chunk is passed to Segment.setup with its offset in the original text.
#
# Segments with a nil offset, or starting before the current cut position,
# are ignored. Processing stops early once the remaining text is nil or empty.
#
# text          - the text to split.
# segments      - segments (responding to +offset+ and +segment_length+).
# skip_segments - when true, the text covered by segments is left out.
#
# Returns an Array of chunks in text order.
def self.split(text, segments, skip_segments = false)
  pieces = []
  consumed = 0 # offset, in the original text, of the first character of +text+

  clean_sort(segments).each do |segment|
    return pieces if text.nil? || text.empty?
    next if segment.offset.nil?

    relative = segment.offset - consumed
    next if relative < 0

    if relative > 0
      gap = text[0..relative - 1]
      Segment.setup(gap, consumed)
      pieces << gap
    end

    last_index = relative + segment.segment_length - 1

    unless skip_segments
      covered = text[relative..last_index]
      Segment.setup(covered, consumed + relative)
      pieces << covered
    end

    consumed += last_index + 1
    text = text[last_index + 1..-1]
  end

  unless text.nil? || text.empty?
    tail = text.dup
    Segment.setup(tail, consumed)
    pieces << tail
  end

  pieces
end
|
Instance Method Details
#eend ⇒ Object
Also known as:
end
57
58
59
|
# File 'lib/rbbt/segment.rb', line 57
# Index of the last position covered by this segment: the offset (nil counts
# as 0) plus the length, minus one.
def eend
  first_index = offset.to_i
  first_index + length - 1
end
|
#range ⇒ Object
63
64
65
|
# File 'lib/rbbt/segment.rb', line 63
# Inclusive Range running from the offset (nil counts as 0) to eend.
def range
  Range.new(offset.to_i, eend)
end
|
#segment_length ⇒ Object
52
53
54
|
# File 'lib/rbbt/segment.rb', line 52
# Length of the segment; simply the receiver's own +length+.
def segment_length
  self.length
end
|