Class: CorpusProcessor::Parsers::Harem

Inherits:
Object
  • Object
show all
Defined in:
lib/corpus-processor/parsers/harem.rb

Constant Summary

# Matches one annotated entity of the form <em ... CATEG="...">text</em>.
#
# The pattern is assembled from named subexpressions: every
# `(?<name> ...){0}` group only *defines* a subpattern (the `{0}` quantifier
# makes it match zero times where it is written) so that it can be invoked
# later with `\g<name>`. The final line of the pattern is the part that
# actually performs the match: opening tag, inner text, closing tag.
#
# Flags: `i` (case-insensitive) and `x` (extended syntax, so whitespace and
# `#` comments inside the literal are ignored).
#
# Named captures read by #parse:
#   inner_text - the text between the opening and closing tags
#   categories - the raw value of the CATEG attribute
CATEGORY_REGEX =
/
  (?<any_text>           .*?                       ){0}
  (?<entity_attributes>  \s\g<any_text>
    CATEG="\g<categories>"\g<any_text>             ){0}
  (?<entity_opening_tag> <em\g<entity_attributes>> ){0}
  (?<entity_closing_tag> <\/em>                    ){0}

  # groups of interest
  (?<inner_text>         \g<any_text>              ){0}
  (?<categories>         \g<any_text>              ){0}

  \g<entity_opening_tag>\g<inner_text>\g<entity_closing_tag>
/ix

Instance Method Summary

Constructor Details

#initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input], traverser = CorpusProcessor::Traverser.new, tokenizer = CorpusProcessor::Tokenizer.new) ⇒ Harem

Returns a new instance of Harem.



18
19
20
21
22
23
24
# File 'lib/corpus-processor/parsers/harem.rb', line 18

# Stores the three collaborators the parser works with.
#
# @param categories [#[]] lookup indexed by a category string; consulted by
#   #extract_category
# @param traverser [#traverse] object that walks the corpus and yields
#   pieces of it to #parse
# @param tokenizer [#join_lines, #tokenize] object used by #parse to
#   normalise the corpus and to turn text into tokens
def initialize(categories = CorpusProcessor::DEFAULT_CATEGORIES[:input],
               traverser  = CorpusProcessor::Traverser.new,
               tokenizer  = CorpusProcessor::Tokenizer.new)
  @categories, @traverser = categories, traverser
  @tokenizer = tokenizer
end

Instance Method Details

#extract_category(categories) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/corpus-processor/parsers/harem.rb', line 44

# Picks the category for an entity from its raw CATEG value.
#
# The value is split on "|" and every piece is looked up in @categories;
# the first lookup that does not yield nil wins.
#
# @param categories [String] "|"-separated category names
# @return [Object, nil] the first non-nil value found in @categories, or
#   nil when none of the names is known
def extract_category(categories)
  candidates = categories.split("|").map do |name|
    @categories[name]
  end
  candidates.find { |candidate| !candidate.nil? }
end

#parse(corpus) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/corpus-processor/parsers/harem.rb', line 26

# Turns a corpus into a flat list of tokens.
#
# The corpus is first passed through the tokenizer's join_lines, then handed
# to the traverser together with CATEGORY_REGEX. For every piece the
# traverser yields:
#   * a String is tokenized with a nil category;
#   * a MatchData is tokenized using its inner_text capture, with the
#     category resolved from its categories capture via #extract_category.
#
# @param corpus the raw corpus, passed as-is to the tokenizer's join_lines
# @return [Array] every token produced by the tokenizer, in traversal order
def parse(corpus)
  collected = []
  joined    = @tokenizer.join_lines(corpus)

  @traverser.traverse(joined, CATEGORY_REGEX) do |piece|
    text = nil
    category = nil

    case piece
    when String
      text = piece
    when MatchData
      text     = piece[:inner_text]
      category = extract_category(piece[:categories])
    end

    collected.push(*@tokenizer.tokenize(text, category))
  end

  collected
end