Module: Excite::Preprocessor
- Included in:
- CRFParser
- Defined in:
- lib/excite/preprocessor.rb
Constant Summary collapse
- MARKER_TYPES =
{ :SQUARE => '\\[.+?\\]', :PAREN => '\\(.+?\\)', :NAKEDNUM => '\\d+', :NAKEDNUMDOT => '\\d+\\.', }
- CLEANUP_RULES_FILE =
"#{File.dirname(__FILE__)}/../../config/citation_cleanup_rules.yml"
Instance Method Summary collapse
- #cleanup_rules ⇒ Object
- #normalize_citation(cite) ⇒ Object
-
#normalize_cite_text(cite_text) ⇒ Object
Removes lines that appear to be junk from the citation text, and applies cleanup regexes from the configuration file.
-
#segment_citations(cite_text) ⇒ Object
Controls the process by which citations are segmented, based on the result of trying to guess the type of citation marker used in the reference section.
-
#split_citations_by_marker(cite_text, marker_type = nil) ⇒ Object
Segments citations that have explicit markers in the reference section.
Instance Method Details
#cleanup_rules ⇒ Object
16 17 18 19 20 21 22 23 24 25 |
# File 'lib/excite/preprocessor.rb', line 16 def cleanup_rules return @rules if @rules raw = YAML.load_file CLEANUP_RULES_FILE @rules = raw['order'].map do |rule_name| re = Regexp.new(raw['rules'][rule_name]['regex'], raw['rules'][rule_name]['ignore_case']) repl = raw['rules'][rule_name]['replacement_str'] || '' { re: re, repl: repl } end end |
#normalize_citation(cite) ⇒ Object
39 40 41 42 43 44 45 46 47 |
# File 'lib/excite/preprocessor.rb', line 39 def normalize_citation(cite) cite = cite.dup cleanup_rules.each do |rule| cite.gsub!(rule[:re], rule[:repl]) end cite end |
#normalize_cite_text(cite_text) ⇒ Object
Removes lines that appear to be junk from the citation text, and applies cleanup regexes from the configuration file.
31 32 33 34 35 36 37 |
# File 'lib/excite/preprocessor.rb', line 31 def normalize_cite_text(cite_text) cite_text.split(/\n/).reject do |line| line.blank? || line =~ /^[\s\d]*$/ end.map do |line| normalize_citation(line) end.join("\n") end |
#segment_citations(cite_text) ⇒ Object
Controls the process by which citations are segmented, based on the result of trying to guess the type of citation marker used in the reference section. Returns a reference to a list of citation objects.
55 56 57 58 59 60 61 62 63 |
# File 'lib/excite/preprocessor.rb', line 55 def segment_citations(cite_text) marker_type = guess_marker_type(cite_text) unless marker_type == 'UNKNOWN' citations = split_unmarked_citations(cite_text) else citations = split_citations_by_marker(cite_text, marker_type) end return citations end |
#split_citations_by_marker(cite_text, marker_type = nil) ⇒ Object
Segments citations that have explicit markers in the reference section. Whenever a new line starts with an expression that matches what we’d expect of a marker, a new citation is started. Returns a reference to a list of citation objects.
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/excite/preprocessor.rb', line 72 def split_citations_by_marker(cite_text, marker_type=nil) citations = [] current_citation = Citation.new current_citation_string = nil cite_text.split(/\n/).each {|line| if line =~ /^\s*(#{MARKER_TYPES{marker_type}})\s*(.*)$/ marker, cite_string = $1, $2 if current_citation_string current_citation.citation_string = current_citation_string citations << current_citation current_citation_string = nil end current_citation = Citation.new current_citation.marker_type = marker_type current_citation.marker = marker current_citation_string = cite_string else if current_citation_string =~ /\s\-$/ current_citation_string.sub(/\-$/, '') current_citation_string << line else current_citation_string << " " << line end end } if current_citation && current_citation_string current_citation.string = current_citation_string citations << current_citation end citations end |