Module: Excite::TokenFeatures
- Included in:
- CRFParser
- Defined in:
- lib/excite/token_features.rb
Defined Under Namespace
Modules: DictFlags
Constant Summary collapse
- DIR =
File.dirname(__FILE__)
- DICT =
TokenFeatures.read_dict_files("#{DIR}/resources/dicts")
- NODE_TYPES_BY_NAME =
{
  'div'=>'div',
  'p'=>'p',
  'ul'=>'div', # lump with div - higher-level structure
  'li'=>'li',
  'tr'=>'div', # lump with div - higher-level structure
  'td'=>'td',
  'span'=>'span',
  'font'=>'span',
  'em'=>'em',
  'i'=>'em',
  'strong'=>'strong',
  'b'=>'strong',
  'u'=>'u',
  'h1'=>'h',
  'h2'=>'h',
  'h3'=>'h',
  'h4'=>'h',
  'h5'=>'h',
  'h6'=>'h',
  'a'=>'a',
  'br'=>'br',
  '#document-fragment'=>'unknown' # the actual tag wasn't captured in the fragment we're parsing
}
Instance Method Summary collapse
- #capitalization(toks, idx, author_names = nil) ⇒ Object
- #clear ⇒ Object
- #dict_status(toks, idx, author_names = nil) ⇒ Object
- #first_1_char(toks, idx, author_names = nil) ⇒ Object
- #first_2_chars(toks, idx, author_names = nil) ⇒ Object
- #first_3_chars(toks, idx, author_names = nil) ⇒ Object
- #first_4_chars(toks, idx, author_names = nil) ⇒ Object
- #first_5_chars(toks, idx, author_names = nil) ⇒ Object
- #firstName(toks, idx, author_names = nil) ⇒ Object
-
#is_in(toks, idx, author_names = nil) ⇒ Object
TODO remove duplication with possible_chapter.
- #last_1_char(toks, idx, author_names = nil) ⇒ Object
- #last_2_chars(toks, idx, author_names = nil) ⇒ Object
- #last_3_chars(toks, idx, author_names = nil) ⇒ Object
- #last_4_chars(toks, idx, author_names = nil) ⇒ Object
- #last_char(toks, idx, author_names = nil) ⇒ Object
- #lastName(toks, idx, author_names = nil) ⇒ Object
- #location(toks, idx, author_names = nil) ⇒ Object
- #location_in_node(toks, idx, author_names = nil) ⇒ Object
- #monthName(toks, idx, author_names = nil) ⇒ Object
- #numbers(toks, idx, author_names = nil) ⇒ Object
- #part_of_speech(toks, idx, author_names = nil) ⇒ Object
- #placeName(toks, idx, author_names = nil) ⇒ Object
-
#possible_chapter(toks, idx = nil, author_names = nil) ⇒ Object
If there is a possible editor entry and “IN” preceded by punctuation, this citation may be a book chapter.
-
#possible_editor(toks, idx = nil, author_names = nil) ⇒ Object
ignores idx.
- #possible_volume(toks, idx, author_names = nil) ⇒ Object
- #publisherName(toks, idx, author_names = nil) ⇒ Object
- #punct(toks, idx, author_names = nil) ⇒ Object
- #tag_name(toks, idx, author_names = nil) ⇒ Object
- #toklcnp(toks, idx, author_names = nil) ⇒ Object
Instance Method Details
#capitalization(toks, idx, author_names = nil) ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/excite/token_features.rb', line 86
# Classifies the capitalization pattern of the token's +np+ value.
#
# @param toks [Array] tokens; each must respond to +np+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] "singleCap" for exactly one uppercase letter, "InitCap"
#   for an uppercase letter followed by lowercase letters, "AllCap" for
#   uppercase letters only, otherwise "others"
def capitalization(toks, idx, author_names = nil)
  case toks[idx].np
  when /^[[:upper:]]$/
    "singleCap"
  when /^[[:upper:]][[:lower:]]+/
    # Only the start is anchored, so "McGraw" also counts as InitCap.
    "InitCap"
  when /^[[:upper:]]+$/
    "AllCap"
  else
    "others"
  end
end
#clear ⇒ Object
47 48 49 50 51 52 |
# File 'lib/excite/token_features.rb', line 47
# Resets the per-instance cached feature state so the next call to a
# memoizing feature method recomputes its value.
#
# @return [nil]
def clear
  @possible_editor = nil
  @possible_chapter = nil
  @dict_status = nil
  @is_proceeding = nil
end
#dict_status(toks, idx, author_names = nil) ⇒ Object
203 204 205 206 |
# File 'lib/excite/token_features.rb', line 203
# Looks up the dictionary value for a token and caches it by index.
#
# The lookup tries the token's +lcnp+ value first, then its downcased
# +raw+ text, and falls back to 0 when neither is a key of DICT.
# Results are cached in @dict_status, indexed by +idx+; the cache is sized
# from the first token list seen and persists until #clear is called.
#
# @param toks [Array] tokens; each must respond to +lcnp+ and +raw+
# @param idx [Integer] index of the token to look up
# @param author_names [Object, nil] not used by this method
# @return [Object] the DICT entry for the token, or 0 when there is none
#   (the callers visible here treat it as an Integer bit mask)
def dict_status(toks, idx, author_names = nil)
  @dict_status ||= [nil] * toks.length
  @dict_status[idx] ||= begin
    tok = toks[idx]
    DICT[tok.lcnp] || DICT[tok.raw.downcase] || 0
  end
end
#first_1_char(toks, idx, author_names = nil) ⇒ Object
67 |
# File 'lib/excite/token_features.rb', line 67
# Returns the first character of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] at most 1 leading character of +raw+
def first_1_char(toks, idx, author_names = nil)
  toks[idx].raw[0, 1]
end
#first_2_chars(toks, idx, author_names = nil) ⇒ Object
68 |
# File 'lib/excite/token_features.rb', line 68
# Returns the first two characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] at most 2 leading characters of +raw+
def first_2_chars(toks, idx, author_names = nil)
  toks[idx].raw[0, 2]
end
#first_3_chars(toks, idx, author_names = nil) ⇒ Object
69 |
# File 'lib/excite/token_features.rb', line 69
# Returns the first three characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] at most 3 leading characters of +raw+
def first_3_chars(toks, idx, author_names = nil)
  toks[idx].raw[0, 3]
end
#first_4_chars(toks, idx, author_names = nil) ⇒ Object
70 |
# File 'lib/excite/token_features.rb', line 70
# Returns the first four characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] at most 4 leading characters of +raw+
def first_4_chars(toks, idx, author_names = nil)
  toks[idx].raw[0, 4]
end
#first_5_chars(toks, idx, author_names = nil) ⇒ Object
71 |
# File 'lib/excite/token_features.rb', line 71
# Returns the first five characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] at most 5 leading characters of +raw+
def first_5_chars(toks, idx, author_names = nil)
  toks[idx].raw[0, 5]
end
#firstName(toks, idx, author_names = nil) ⇒ Object
198 199 200 201 |
# File 'lib/excite/token_features.rb', line 198
# Reports whether the token looks like a first name.
#
# A token is a first name when its +lcnp+ value equals the first element
# of +author_names+ (if given), or when its dictionary status has the
# DictFlags::FIRST_NAME bit set.
#
# @param toks [Array] tokens; each must respond to +lcnp+
# @param idx [Integer] index of the token to inspect
# @param author_names [Array, nil] optional known author name parts; only
#   the first element is compared
# @return [String] 'firstName' or 'noFirstName'
def firstName(toks, idx, author_names = nil)
  return 'firstName' if author_names && author_names.first == toks[idx].lcnp

  flags = dict_status(toks, idx)
  (flags & DictFlags::FIRST_NAME) > 0 ? 'firstName' : 'noFirstName'
end
#is_in(toks, idx, author_names = nil) ⇒ Object
TODO remove duplication with possible_chapter
143 144 145 146 147 148 149 150 |
# File 'lib/excite/token_features.rb', line 143
# Reports whether the token is an "in" that introduces a containing work.
#
# The token qualifies when it is neither first nor last, its +lcnp+ value
# is 'in', the previous token's part of speech is one of pp/ppr/ppc/pps,
# and the next token either has part of speech ppl/ppc/pps or an +np+
# value starting with an uppercase letter.
#
# TODO remove duplication with possible_chapter
#
# @param toks [Array] tokens; each must respond to +lcnp+, +np+ and
#   +part_of_speech+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] "inBook" or "notInBook"
def is_in(toks, idx, author_names = nil)
  interior = idx > 0 && idx < (toks.length - 1)
  return "notInBook" unless interior && toks[idx].lcnp == 'in'

  following = toks[idx + 1]
  prev_is_separator = ['pp', 'ppr', 'ppc', 'pps'].include?(toks[idx - 1].part_of_speech)
  next_is_separator = ['ppl', 'ppc', 'pps'].include?(following.part_of_speech)
  in_book = prev_is_separator && (next_is_separator || following.np =~ /^[[:upper:]]/)
  in_book ? "inBook" : "notInBook"
end
#last_1_char(toks, idx, author_names = nil) ⇒ Object
73 |
# File 'lib/excite/token_features.rb', line 73
# Returns the last character of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String, nil] the final character of +raw+, or nil when +raw+
#   is empty
def last_1_char(toks, idx, author_names = nil)
  toks[idx].raw[-1, 1]
end
#last_2_chars(toks, idx, author_names = nil) ⇒ Object
74 |
# File 'lib/excite/token_features.rb', line 74
# Returns the last two characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] the final 2 characters of +raw+, or all of +raw+ when
#   it is shorter than that
def last_2_chars(toks, idx, author_names = nil)
  raw = toks[idx].raw
  # String#[] returns nil when the negative start is out of range.
  raw[-2, 2] || raw
end
#last_3_chars(toks, idx, author_names = nil) ⇒ Object
75 |
# File 'lib/excite/token_features.rb', line 75
# Returns the last three characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] the final 3 characters of +raw+, or all of +raw+ when
#   it is shorter than that
def last_3_chars(toks, idx, author_names = nil)
  raw = toks[idx].raw
  # String#[] returns nil when the negative start is out of range.
  raw[-3, 3] || raw
end
#last_4_chars(toks, idx, author_names = nil) ⇒ Object
76 |
# File 'lib/excite/token_features.rb', line 76
# Returns the last four characters of the token's raw text.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] the final 4 characters of +raw+, or all of +raw+ when
#   it is shorter than that
def last_4_chars(toks, idx, author_names = nil)
  raw = toks[idx].raw
  # String#[] returns nil when the negative start is out of range.
  raw[-4, 4] || raw
end
#last_char(toks, idx, author_names = nil) ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/excite/token_features.rb', line 54
# Maps the final character of the token's raw text to a character class.
#
# @param toks [Array] tokens; each must respond to +raw+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String, Integer, nil] 'a' for a lowercase letter, 'A' for an
#   uppercase letter, the Integer 0 for an ASCII digit, otherwise the
#   character itself (nil when +raw+ is empty)
def last_char(toks, idx, author_names = nil)
  final = toks[idx].raw[-1, 1]
  case final
  when /[[:lower:]]/
    'a'
  when /[[:upper:]]/
    'A'
  when /[0-9]/
    # Note: an Integer, not the String "0".
    0
  else
    final
  end
end
#lastName(toks, idx, author_names = nil) ⇒ Object
193 194 195 196 |
# File 'lib/excite/token_features.rb', line 193
# Reports whether the token looks like a last name.
#
# A token is a last name when its +lcnp+ value equals the last element of
# +author_names+ (if given), or when its dictionary status has the
# DictFlags::LAST_NAME bit set.
#
# @param toks [Array] tokens; each must respond to +lcnp+
# @param idx [Integer] index of the token to inspect
# @param author_names [Array, nil] optional known author name parts; only
#   the last element is compared
# @return [String] 'lastName' or 'noLastName'
def lastName(toks, idx, author_names = nil)
  return 'lastName' if author_names && author_names.last == toks[idx].lcnp

  flags = dict_status(toks, idx)
  (flags & DictFlags::LAST_NAME) > 0 ? 'lastName' : 'noLastName'
end
#location(toks, idx, author_names = nil) ⇒ Object
152 153 154 |
# File 'lib/excite/token_features.rb', line 152
# Returns the token's relative position in the token list, scaled to
# tenths and rounded.
#
# @param toks [Array] the full token list; only its length is used
# @param idx [Integer] index of the token
# @param author_names [Object, nil] not used by this method
# @return [Integer] (idx / toks.length * 10) rounded to the nearest integer
def location(toks, idx, author_names = nil)
  ((idx.to_f / toks.length) * 10).round
end
#location_in_node(toks, idx, author_names = nil) ⇒ Object
239 240 241 |
# File 'lib/excite/token_features.rb', line 239
# Returns the token's relative position within its node, scaled to tenths
# and rounded.
#
# @param toks [Array] tokens; each must respond to +idx_in_node+ and
#   +node_token_count+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [Integer] (idx_in_node / node_token_count * 10) rounded to the
#   nearest integer
def location_in_node(toks, idx, author_names = nil)
  tok = toks[idx]
  ((tok.idx_in_node.to_f / tok.node_token_count) * 10).round
end
#monthName(toks, idx, author_names = nil) ⇒ Object
189 190 191 |
# File 'lib/excite/token_features.rb', line 189
# Reports whether the token's dictionary status has the
# DictFlags::MONTH_NAME bit set.
#
# @param toks [Array] tokens, passed through to #dict_status
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] 'monthName' or 'noMonthName'
def monthName(toks, idx, author_names = nil)
  flags = dict_status(toks, idx)
  (flags & DictFlags::MONTH_NAME) > 0 ? 'monthName' : 'noMonthName'
end
#numbers(toks, idx, author_names = nil) ⇒ Object
99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/excite/token_features.rb', line 99
# Classifies the numeric content of a token. Checks run in order and the
# first match wins.
#
# @param toks [Array] tokens; each must respond to +raw+ and +np+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] one of "possiblePage", "year", "1dig", "2dig", "3dig",
#   "4+dig", "ordinal", "hasDig" or "nonNum"
def numbers(toks, idx, author_names = nil)
  raw = toks[idx].raw
  np = toks[idx].np

  if raw =~ /[0-9]\-[0-9]/
    "possiblePage"
  elsif raw =~ /^\D*(19|20)[0-9][0-9]\D*$/
    # A 19xx/20xx number optionally wrapped in non-digits, e.g. "(1999)".
    "year"
  elsif np =~ /^(19|20)[0-9][0-9]$/
    "year"
  elsif np =~ /^[0-9]$/
    "1dig"
  elsif np =~ /^[0-9][0-9]$/
    "2dig"
  elsif np =~ /^[0-9][0-9][0-9]$/
    "3dig"
  elsif np =~ /^[0-9]+$/
    "4+dig"
  elsif np =~ /^[0-9]+(th|st|nd|rd)$/
    "ordinal"
  elsif np =~ /[0-9]/
    "hasDig"
  else
    "nonNum"
  end
end
#part_of_speech(toks, idx, author_names = nil) ⇒ Object
243 244 245 |
# File 'lib/excite/token_features.rb', line 243
# Returns the part-of-speech value stored on the token.
#
# @param toks [Array] tokens; each must respond to +part_of_speech+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [Object] whatever +toks[idx].part_of_speech+ returns
def part_of_speech(toks, idx, author_names = nil)
  toks[idx].part_of_speech
end
#placeName(toks, idx, author_names = nil) ⇒ Object
185 186 187 |
# File 'lib/excite/token_features.rb', line 185
# Reports whether the token's dictionary status has the
# DictFlags::PLACE_NAME bit set.
#
# @param toks [Array] tokens, passed through to #dict_status
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] 'placeName' or 'noPlaceName'
def placeName(toks, idx, author_names = nil)
  flags = dict_status(toks, idx)
  (flags & DictFlags::PLACE_NAME) > 0 ? 'placeName' : 'noPlaceName'
end
#possible_chapter(toks, idx = nil, author_names = nil) ⇒ Object
If there is a possible editor entry and “IN” preceded by punctuation, this citation may be a book chapter.
ignores idx
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/excite/token_features.rb', line 126
# Reports whether the citation as a whole may be a book chapter.
#
# The citation qualifies when some interior token has an +lcnp+ value of
# 'in', the token before it has part of speech pp/ppr/ppc/pps, and either
# the citation has a possible editor or the token after it has part of
# speech ppl/ppc/pps.
#
# The result is cached in @possible_chapter until #clear resets it. The
# previous version checked the cache but never wrote to it, so every call
# rescanned the whole token list.
#
# @param toks [Array] tokens; each must respond to +lcnp+ and
#   +part_of_speech+
# @param idx [Integer, nil] ignored; the feature is computed per citation
# @param author_names [Object, nil] not used by this method
# @return [String] "possibleChapter" or "noChapter"
def possible_chapter(toks, idx = nil, author_names = nil)
  return @possible_chapter unless @possible_chapter.nil?

  has_editor = possible_editor(toks) == 'possibleEditors'
  last_idx = toks.length - 1
  has_chapter = toks.each_with_index.any? do |t, i|
    if i > 0 && i < last_idx && t.lcnp == 'in'
      prev_is_separator = ['pp', 'ppr', 'ppc', 'pps'].include?(toks[i - 1].part_of_speech)
      next_is_separator = ['ppl', 'ppc', 'pps'].include?(toks[i + 1].part_of_speech)
      prev_is_separator && (has_editor || next_is_separator)
    end
  end
  @possible_chapter = has_chapter ? "possibleChapter" : "noChapter"
end
#possible_editor(toks, idx = nil, author_names = nil) ⇒ Object
ignores idx
112 113 114 115 116 117 118 119 120 |
# File 'lib/excite/token_features.rb', line 112
# Reports whether any token in the citation is an editor marker
# (ed, editor, editors, eds, edited), compared against each token's
# +lcnp+ value.
#
# The result is cached in @possible_editor until #clear resets it.
#
# @param toks [Array] tokens; each must respond to +lcnp+
# @param idx [Integer, nil] ignored; the feature is computed per citation
# @param author_names [Object, nil] not used by this method
# @return [String] "possibleEditors" or "noEditors"
def possible_editor(toks, idx = nil, author_names = nil)
  return @possible_editor unless @possible_editor.nil?

  editor_words = %w(ed editor editors eds edited)
  found = toks.any? { |t| editor_words.include?(t.lcnp) }
  @possible_editor = found ? "possibleEditors" : "noEditors"
end
#possible_volume(toks, idx, author_names = nil) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
# File 'lib/excite/token_features.rb', line 163
# Classifies a token as a volume number, an issue number, or neither, by
# consulting the possible_vol_with_* / possible_issue_with_str helpers on
# this token and on up to three preceding positions. Checks run in order
# and the first match wins.
#
# NOTE(review): idx-1, idx-2 and idx-3 are passed to the helpers without a
# lower-bound check; whether negative indices are rejected depends on the
# helpers, which are not visible here - confirm.
#
# @param toks [Array] tokens, passed through to the helpers
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] 'volume', 'issue' or 'noVolume'
def possible_volume(toks, idx, author_names = nil)
  if possible_vol_with_str(toks, idx)
    'volume'
  elsif possible_vol_with_str(toks, idx - 1) && possible_issue_with_str(toks, idx)
    'issue'
  elsif possible_vol_with_str(toks, idx - 2) &&
        possible_issue_with_str(toks, idx - 1) &&
        possible_issue_with_str(toks, idx)
    'issue'
  elsif possible_vol_with_parens(toks, idx)
    'volume'
  elsif (1..3).any? { |i| possible_vol_with_parens(toks, idx - i) }
    'issue'
  elsif possible_vol_with_colon(toks, idx)
    'volume'
  else
    'noVolume'
  end
end
#publisherName(toks, idx, author_names = nil) ⇒ Object
181 182 183 |
# File 'lib/excite/token_features.rb', line 181
# Reports whether the token's dictionary status has the
# DictFlags::PUBLISHER_NAME bit set.
#
# @param toks [Array] tokens, passed through to #dict_status
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] 'publisherName' or 'noPublisherName'
def publisherName(toks, idx, author_names = nil)
  flags = dict_status(toks, idx)
  (flags & DictFlags::PUBLISHER_NAME) > 0 ? 'publisherName' : 'noPublisherName'
end
#punct(toks, idx, author_names = nil) ⇒ Object
156 157 158 159 160 161 |
# File 'lib/excite/token_features.rb', line 156
# Classifies the punctuation pattern of a token. Checks run in order and
# the first match wins.
#
# @param toks [Array] tokens; each must respond to +raw+ and +np+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] "multiHyphen" when +raw+ has two or more hyphens,
#   "truncated" when it has a letter and ends in a hyphen, "abbrev" when
#   it has a letter and ends in a period, "hasPunct" when +np+ differs
#   from +raw+, otherwise "others"
def punct(toks, idx, author_names = nil)
  raw = toks[idx].raw

  if raw =~ /\-.*\-/
    "multiHyphen"
  elsif raw =~ /[[:alpha:]].*\-$/
    "truncated"
  elsif raw =~ /[[:alpha:]].*\.$/
    "abbrev"
  elsif toks[idx].np != raw
    # presumably +np+ is +raw+ with punctuation stripped - confirm
    "hasPunct"
  else
    "others"
  end
end
#tag_name(toks, idx, author_names = nil) ⇒ Object
233 234 235 236 237 |
# File 'lib/excite/token_features.rb', line 233
# Returns the normalized node type for the node that contains the token.
#
# For a text node the parent's name is used; otherwise the node's own
# name. The name is downcased and mapped through NODE_TYPES_BY_NAME, with
# 'other' for names that are not in the table.
#
# @param toks [Array] tokens; each must respond to +node+, whose value
#   responds to +text?+, +name+ and +parent+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] a NODE_TYPES_BY_NAME value, or 'other'
def tag_name(toks, idx, author_names = nil)
  node = toks[idx].node
  element = node.text? ? node.parent : node
  NODE_TYPES_BY_NAME[element.name.downcase] || 'other'
end
#toklcnp(toks, idx, author_names = nil) ⇒ Object
78 79 80 81 82 83 84 |
# File 'lib/excite/token_features.rb', line 78
# Returns the token's +lcnp+ value, or the placeholder "EMPTY" when that
# value is blank.
#
# NOTE(review): +blank?+ is not core Ruby; presumably it comes from
# ActiveSupport - confirm it is loaded wherever this module is used.
#
# @param toks [Array] tokens; each must respond to +lcnp+
# @param idx [Integer] index of the token to inspect
# @param author_names [Object, nil] not used by this method
# @return [String] the +lcnp+ value, or "EMPTY"
def toklcnp(toks, idx, author_names = nil)
  return "EMPTY" if toks[idx].lcnp.blank?

  toks[idx].lcnp
end