Class: Tychus::Parsers::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/tychus/parsers/base.rb

Direct Known Subclasses

SchemaOrgParser

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(uri) ⇒ Base

Returns a new instance of Base.



31
32
33
34
35
36
# File 'lib/tychus/parsers/base.rb', line 31

# Opens +uri+, parses it as HTML and narrows the document down to the
# recipe root selected by the class-level +root_doc+ selector.
#
# @param uri [String] location passed straight to Kernel#open
def initialize(uri)
  @uri = uri
  @recipe = Recipe.new
  # NOTE(review): Kernel#open is documented to run a subprocess for strings
  # starting with "|", and only fetches URLs when open-uri has patched it
  # (no longer the case on Ruby >= 3.0) -- confirm +uri+ is trusted and
  # consider URI.open.
  # Block form so the underlying IO is closed as soon as parsing is done
  # instead of lingering until garbage collection.
  @doc = open(uri) { |io| Nokogiri::HTML(io) }
  @recipe_doc = @doc.css(self.class.root_doc)
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



7
8
9
# File 'lib/tychus/parsers/base.rb', line 7

# Reader for @doc, which #initialize sets to the Nokogiri::HTML parse of the
# opened +uri+.
def doc
  @doc
end

#recipe ⇒ Object (readonly)

Returns the value of attribute recipe.



7
8
9
# File 'lib/tychus/parsers/base.rb', line 7

# Reader for @recipe, the Recipe instance created in #initialize and
# populated (then returned) by #parse.
def recipe
  @recipe
end

#recipe_doc ⇒ Object (readonly)

Returns the value of attribute recipe_doc.



7
8
9
# File 'lib/tychus/parsers/base.rb', line 7

# Reader for @recipe_doc, the result of running the class-level +root_doc+
# CSS selector against the parsed document in #initialize.
def recipe_doc
  @recipe_doc
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



7
8
9
# File 'lib/tychus/parsers/base.rb', line 7

# Reader for @uri, the location this parser was constructed with.
def uri
  @uri
end

Class Method Details

.recipe_attributes ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/tychus/parsers/base.rb', line 9

# Names of the recipe attributes this parser extracts.
#
# #parse iterates this list, calling +parse_<attr>+ on the parser and
# +<attr>=+ on the recipe for every entry.
#
# @return [Array<Symbol>]
def self.recipe_attributes
  # TODO: clear up these attributes. Are they used? Real example to
  # verify?
    # recipeType
    # photo
    # published
    # summary
    # review - see schema.org/Review
  # Symbol-array literal; without the leading "%" this does not parse.
  %i[
    name
    author
    description
    prep_time
    cook_time
    total_time
    recipe_yield
    ingredients
    recipe_instructions
    image
  ]
end

Instance Method Details

#clean_instructions(obj) ⇒ Object



138
139
140
# File 'lib/tychus/parsers/base.rb', line 138

# Returns +obj+ unchanged. #parse_recipe_instructions passes its extracted
# steps through this method before returning them.
# NOTE(review): presumably an override point for subclasses -- confirm.
#
# @param obj [Object] the extracted instructions
# @return [Object] the same object
def clean_instructions(obj)
  obj
end

#parse ⇒ Object



38
39
40
41
42
43
44
# File 'lib/tychus/parsers/base.rb', line 38

# Runs every +parse_<attr>+ extractor named by #recipe_attributes and
# stores each result on the recipe through the matching +<attr>=+ writer,
# after passing it through #Value.
#
# @return [Object] the populated recipe
def parse
  recipe_attributes.each_with_object(recipe) do |attribute, target|
    extracted = __send__("parse_#{attribute}")
    target.__send__("#{attribute}=", Value(extracted))
  end
end

#parse_author ⇒ Object



46
47
48
49
# File 'lib/tychus/parsers/base.rb', line 46

# Extracts the author.
#
# @return [Object] +content+ of the node looked up for the +author+ itemprop
def parse_author
  # is it always first?
  author_node = itemprop_node_for(:author)
  author_node.content
end

#parse_cook_time ⇒ Object



80
81
82
83
84
# File 'lib/tychus/parsers/base.rb', line 80

# Extracts the cook time by handing the +cookTime+ itemprop node to
# #parse_duration.
#
# @return [Object] whatever #parse_duration returns for that node
def parse_cook_time
  # is it always first?
  # leverage iso8601
  cook_time_node = itemprop_node_for(:cookTime)
  parse_duration(cook_time_node)
end

#parse_description ⇒ Object



51
52
53
54
# File 'lib/tychus/parsers/base.rb', line 51

# Extracts the description.
#
# @return [Object] +content+ of the node looked up for the +description+
#   itemprop
def parse_description
  # is it always first?
  description_node = itemprop_node_for(:description)
  description_node.content
end

#parse_duration(node) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/tychus/parsers/base.rb', line 109

# Reads a duration value off +node+, choosing the attribute by element name:
# +content+ for "meta"/"span", +datetime+ for "time". Any other element
# yields a fresh NullObject.
#
# @param node [Object] must respond to +name+ and +attr+
# @return [Object] the attribute value, or a NullObject
def parse_duration(node)
  # Allrecipes - 'time' element
  # Foodnetwork - 'meta' element (std according to
  # Schema.org/Recipe)
  element_name = node.name
  content_elements = ["meta", "span"]
  datetime_elements = ["time"]

  return node.attr('content') if content_elements.include?(element_name)
  return node.attr('datetime') if datetime_elements.include?(element_name)

  NullObject.new
end

#parse_image ⇒ Object



86
87
88
89
# File 'lib/tychus/parsers/base.rb', line 86

# Extracts the image location.
#
# @return [Object] the +src+ attribute of the node looked up for the
#   +image+ itemprop
def parse_image
  # is it always first?
  image_node = itemprop_node_for(:image)
  image_node.attr('src')
end

#parse_ingredients ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
# File 'lib/tychus/parsers/base.rb', line 91

# Collects one line per +ingredients+ itemprop node inside the recipe
# document: the +content+ of the node's element children joined with single
# spaces. Lines that are +blank?+ are dropped.
#
# @return [Array<String>]
def parse_ingredients
  # NOT FIRST
  ingredient_nodes = recipe_doc.css('[itemprop="ingredients"]')

  ingredient_lines = ingredient_nodes.map do |ingredient_node|
    child_texts = ingredient_node.element_children.map(&:content)
    child_texts.join(" ")
  end

  ingredient_lines.reject(&:blank?)
end

#parse_name ⇒ Object



75
76
77
78
# File 'lib/tychus/parsers/base.rb', line 75

# Extracts the recipe name.
#
# @return [Object] +content+ of the node looked up for the +name+ itemprop
def parse_name
  # is it always first?
  name_node = itemprop_node_for(:name)
  name_node.content
end

#parse_prep_time ⇒ Object



103
104
105
106
107
# File 'lib/tychus/parsers/base.rb', line 103

# Extracts the preparation time by handing the +prepTime+ itemprop node to
# #parse_duration.
#
# @return [Object] whatever #parse_duration returns for that node
def parse_prep_time
  # is it always first?
  # leverage iso8601
  prep_time_node = itemprop_node_for(:prepTime)
  parse_duration(prep_time_node)
end

#parse_recipe_instructions ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/tychus/parsers/base.rb', line 56

# Builds the list of instruction steps from the element children of the
# +recipeInstructions+ itemprop node: header and div children are skipped,
# each remaining child's text is space-squeezed, right-stripped and split
# on the "\r\n \r\n " separator, blank pieces are dropped, and the result is
# passed through #clean_instructions.
#
# @return [Object] whatever #clean_instructions returns for the step list
def parse_recipe_instructions
  # strip empty strings, drop trailing whitespace, clean carriage returns (\r\n)
  #
  # Allrecipes: <li><span>lorem ipsum</span></li>
  # FoodNetwork: <p>lorem ipsum</p>
  # reject headers such as "Directions" and divs such as .categories for Foodnetwork recipes
  reject_regex = /^(h.|div)$/

  instructions_node = itemprop_node_for(:recipeInstructions)
  step_nodes = instructions_node.element_children.reject do |child|
    child.name =~ reject_regex
  end

  steps = step_nodes.flat_map do |child|
    tidy_text = child.content.squeeze(" ").rstrip
    tidy_text.split("\r\n\s\r\n\s")
  end

  clean_instructions(steps.reject(&:blank?))
end

#parse_recipe_yield ⇒ Object



123
124
125
126
# File 'lib/tychus/parsers/base.rb', line 123

# Extracts the recipe yield.
#
# @return [Object] +content+ of the node looked up for the +recipeYield+
#   itemprop
def parse_recipe_yield
  # is it always first?
  yield_node = itemprop_node_for(:recipeYield)
  yield_node.content
end

#parse_total_time ⇒ Object



128
129
130
131
132
# File 'lib/tychus/parsers/base.rb', line 128

# Extracts the total time by handing the +totalTime+ itemprop node to
# #parse_duration.
#
# @return [Object] whatever #parse_duration returns for that node
def parse_total_time
  # is it always first?
  # leverage iso8601
  total_time_node = itemprop_node_for(:totalTime)
  parse_duration(total_time_node)
end

#recipe_attributes ⇒ Object



134
135
136
# File 'lib/tychus/parsers/base.rb', line 134

# Instance-level shortcut that delegates to the class method of the same
# name on the receiver's own class.
def recipe_attributes
  self.class.recipe_attributes
end

#Value(obj) ⇒ Object



142
143
144
145
146
147
# File 'lib/tychus/parsers/base.rb', line 142

# Converts a NullObject placeholder into +nil+; every other value is passed
# through untouched.
#
# @param obj [Object] a parsed property value
# @return [Object, nil] +nil+ when +obj+ is a NullObject, otherwise +obj+
def Value(obj)
  return nil if NullObject === obj

  obj
end