Class: RecipeCrawler::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/recipe_crawler/crawler.rb

Overview

This is the main class to crawl recipes from a given url

1. Crawler will crawl the given URL to find other recipe URLs on the website
2. it will crawl the URLs found to discover still more URLs, again & again
3. it will scrape the URLs found to extract the recipe data

Constant Summary collapse

ALLOWED_URLS =

URLs that the crawler can parse

{
	cuisineaz: 'http://www.cuisineaz.com/recettes/',
	marmiton: 'http://www.marmiton.org/recettes/',
	g750: 'http://www.750g.com/'
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Crawler

Create a Crawler

Parameters:

  • url (String)

    the URL of a recipe from which other recipes will be crawled



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/recipe_crawler/crawler.rb', line 37

# Create a Crawler.
#
# @param url [String] URL of a recipe used as the crawl starting point
# @param db_path [String] path of the SQLite database file results are saved to
# @raise [ArgumentError] if the url does not belong to a supported host
def initialize url, db_path = 'results.sqlite3'
	@url = url
	# Fail fast on hosts we cannot parse (url_valid? also sets @host on success).
	raise ArgumentError, 'This url cannot be used' unless url_valid?

	@recipes = []
	@crawled_urls = []
	@scraped_urls = []
	@to_crawl_urls = [url]
	@db = SQLite3::Database.new db_path
	@db.execute "CREATE TABLE IF NOT EXISTS recipes(
		Id INTEGER PRIMARY KEY, 
		title TEXT, 
		preptime INTEGER, 
		cooktime INTEGER, 
		ingredients TEXT, 
		steps TEXT, 
		image TEXT
	)"
end

Instance Attribute Details

#crawled_urlsArray<String>

URLs already crawled for links

Returns:

  • (Array<String>)

    the current value of crawled_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs whose links have already been harvested
def crawled_urls
  @crawled_urls
end

#dbSQLite3::Database

Sqlite database where recipe will be saved

Returns:

  • (SQLite3::Database)

    the current value of db



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [SQLite3::Database] handle on the SQLite file recipes are saved to
def db
  @db
end

#hostSymbol

of url’s host

Returns:

  • (Symbol)

    the current value of host



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Symbol] key of ALLOWED_URLS matching the starting url's host
def host
  @host
end

#recipesArray<RecipeSraper::Recipe>

recipes fetched

Returns:

  • (Array<RecipeSraper::Recipe>)

    the current value of recipes



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<RecipeSraper::Recipe>] recipes fetched so far
def recipes
  @recipes
end

#scraped_urlsArray<String>

URLs already scraped into recipes

Returns:

  • (Array<String>)

    the current value of scraped_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs already scraped into recipes
def scraped_urls
  @scraped_urls
end

#to_crawl_urlsArray<String>

URLs queued and waiting to be crawled

Returns:

  • (Array<String>)

    the current value of to_crawl_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs queued and waiting to be crawled
def to_crawl_urls
  @to_crawl_urls
end

#urlString

first url parsed

Returns:

  • (String)

    the current value of url



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [String] the first url given to the crawler
def url
  @url
end

Instance Method Details

#crawl!(limit = 2) {|RecipeSraper::Recipe| ... } ⇒ Object

Start the crawl

Parameters:

  • limit (Integer) (defaults to: 2)

Yields:

  • (RecipeSraper::Recipe)

    as recipe scraped



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/recipe_crawler/crawler.rb', line 81

# Start the crawl: harvest recipe links from the starting url, then scrape
# them and yield each recipe to the caller's block.
#
# @param limit [Integer] rough cap on pages crawled / recipes yielded
# @yield [RecipeSraper::Recipe] each recipe scraped
# @raise [NotImplementedError] for hosts other than :cuisineaz
def crawl! limit=2
	# Only the cuisineaz link-discovery strategy is implemented so far.
	raise NotImplementedError unless @host == :cuisineaz

	# Keep harvesting links until the queue drains or we exceed the limit.
	until @to_crawl_urls.empty?
		get_links @to_crawl_urls.first
		break if @crawled_urls.count > limit
	end

	return unless block_given?
	# Scrape at most `limit` of the crawled pages and hand each recipe back.
	@crawled_urls.first(limit).each do |page_url|
		yield scrape(page_url)
	end
end

This method returns an undefined value.

Get recipes links from the given url

Parameters:

  • url (String)

    as url to scrape



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/recipe_crawler/crawler.rb', line 128

# Get recipe links from the given url and queue the new ones.
#
# Marks `url` as crawled, removes it from the queue, and appends every
# not-yet-crawled recipe link found on the page to @to_crawl_urls.
#
# @param url [String] page to fetch and scan for links
# @return [void]
def get_links url
	# catch 404 error from host
	begin
		doc = Nokogiri::HTML(open(url))
		# find internal links on page
		doc.css('#tagCloud  a').each do |link|
			link = link.attr('href')
			# Queue the link only if it is a recipe URL of the current host
			# and has not been crawled already.
			# (Fix: the previous check tested `url` instead of `link`, which
			# re-queued pages that were already crawled.)
			if link.include?(ALLOWED_URLS[@host]) && !@crawled_urls.include?(link)
				@to_crawl_urls << link
			end
		end
		@to_crawl_urls.delete url
		@crawled_urls << url
		@to_crawl_urls.uniq!

	rescue OpenURI::HTTPError
		# Unreachable page: drop it from the queue and keep crawling.
		@to_crawl_urls.delete url
		warn "#{url} cannot be reached"
	end
end

#save(recipe) ⇒ Boolean

Save recipe

Parameters:

  • recipe (RecipeSraper::Recipe)

    as recipe to save

Returns:

  • (Boolean)

    as true if success



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/recipe_crawler/crawler.rb', line 156

# Save a recipe into the SQLite database.
#
# @param recipe [RecipeSraper::Recipe] recipe to persist
# @return [Boolean] true on success, false if the insert failed
def save recipe
	begin
		# Fix: the :cooktime placeholder was previously left unbound, so the
		# cooktime column was never persisted.
		@db.execute "INSERT INTO recipes (title, preptime, cooktime, ingredients, steps, image)
				VALUES (:title, :preptime, :cooktime, :ingredients, :steps, :image)",
				title: recipe.title,
				preptime: recipe.preptime,
				cooktime: recipe.cooktime,
				ingredients: recipe.ingredients.join("\n"),
				steps: recipe.steps.join("\n"),
				image: recipe.image

		return true
		
	rescue SQLite3::Exception => e 
			puts "Exception occurred #{e}"
			return false
	end
end

#scrape(url) ⇒ RecipeSraper::Recipe

Scrape the given url into a recipe and save it.

Returns:

  • (RecipeSraper::Recipe)

    as recipe scraped



111
112
113
114
115
116
117
118
119
120
# File 'lib/recipe_crawler/crawler.rb', line 111

# Scrape the given url into a recipe, record it, and save it to the database.
#
# @param url [String] url to scrape
# @return [RecipeSraper::Recipe] the recipe scraped
# @raise [SQLite3::Exception] if the recipe could not be saved
def scrape url
	recipe = RecipeSraper::Recipe.new url
	@scraped_urls << url
	@recipes << recipe
	if save recipe
		return recipe
	else
		# Fix: error message previously read 'accnot save recipe'.
		raise SQLite3::Exception, 'cannot save recipe'
	end
end

#url_valid?Boolean

Check if the url can be parsed and set the host

Returns:

  • (Boolean)

    true if url can be parsed



65
66
67
68
69
70
71
72
73
# File 'lib/recipe_crawler/crawler.rb', line 65

# Check whether the url belongs to one of the supported hosts and, if so,
# remember which one.
#
# @return [Boolean] true if the url can be parsed (also sets @host)
def url_valid?
	matched = ALLOWED_URLS.find { |_name, prefix| url.include?(prefix) }
	return false unless matched

	@host = matched.first
	true
end