Class: RecipeCrawler::Crawler

Inherits:
Object
  • Object
show all
Defined in:
lib/recipe_crawler/crawler.rb

Overview

This is the main class to crawl recipes from a given url

1. Crawler will crawl the given URL to find other recipe URLs on the website
2. it will crawl the URLs found to discover still more URLs, again & again
3. it will scrape the URLs found to extract the recipe data

Constant Summary collapse

ALLOWED_URLS =

URLs that the crawler can parse

{
	cuisineaz: 'http://www.cuisineaz.com/recettes/',
	marmiton: 'http://www.marmiton.org/recettes/',
	g750: 'http://www.750g.com/'
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Crawler

Create a Crawler

Parameters:

  • url (String)

    the URL of a recipe from which other recipes will be crawled



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/recipe_crawler/crawler.rb', line 37

# Create a Crawler.
#
# @param url [String] URL of a recipe used as the crawl starting point
# @param db_path [String] path of the SQLite database file results are saved to
# @raise [ArgumentError] if the url does not belong to a supported host
def initialize url, db_path = 'results.sqlite3'
	@url = url
	# Fail fast on hosts we cannot parse (url_valid? also sets @host on success).
	raise ArgumentError, 'This url cannot be used' unless url_valid?

	@recipes = []
	@crawled_urls = []
	@scraped_urls = []
	@to_crawl_urls = [url]
	@db = SQLite3::Database.new db_path
	@db.execute "CREATE TABLE IF NOT EXISTS recipes(
		Id INTEGER PRIMARY KEY, 
		title TEXT, 
		preptime INTEGER, 
		cooktime INTEGER, 
		ingredients TEXT, 
		steps TEXT, 
		image TEXT
	)"
end

Instance Attribute Details

#crawled_urlsArray<String>

URLs already crawled for links

Returns:

  • (Array<String>)

    the current value of crawled_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs whose links have already been harvested
def crawled_urls
  @crawled_urls
end

#dbSQLite3::Database

Sqlite database where recipe will be saved

Returns:

  • (SQLite3::Database)

    the current value of db



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [SQLite3::Database] handle on the SQLite file recipes are saved to
def db
  @db
end

#hostSymbol

of url’s host

Returns:

  • (Symbol)

    the current value of host



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Symbol] key of ALLOWED_URLS matching the starting url's host
def host
  @host
end

#recipesArray<RecipeSraper::Recipe>

recipes fetched

Returns:

  • (Array<RecipeSraper::Recipe>)

    the current value of recipes



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<RecipeSraper::Recipe>] recipes fetched so far
def recipes
  @recipes
end

#scraped_urlsArray<String>

URLs already scraped into recipes

Returns:

  • (Array<String>)

    the current value of scraped_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs already scraped into recipes
def scraped_urls
  @scraped_urls
end

#to_crawl_urlsArray<String>

URLs queued and waiting to be crawled

Returns:

  • (Array<String>)

    the current value of to_crawl_urls



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [Array<String>] URLs queued and waiting to be crawled
def to_crawl_urls
  @to_crawl_urls
end

#urlString

first url parsed

Returns:

  • (String)

    the current value of url



22
23
24
# File 'lib/recipe_crawler/crawler.rb', line 22

# @return [String] the first url given to the crawler
def url
  @url
end

Instance Method Details

#crawl!(limit = 2) {|RecipeSraper::Recipe| ... } ⇒ Object

Start the crawl

Parameters:

  • limit (Integer) (defaults to: 2)

Yields:

  • (RecipeSraper::Recipe)

    as recipe scraped



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/recipe_crawler/crawler.rb', line 81

# Start the crawl: harvest recipe links from the starting url, then scrape
# them and yield each recipe to the caller's block.
#
# @param limit [Integer] rough cap on pages crawled / recipes yielded
# @yield [RecipeSraper::Recipe] each recipe scraped
# @raise [NotImplementedError] for hosts other than :cuisineaz
def crawl! limit=2
	# Only the cuisineaz link-discovery strategy is implemented so far.
	raise NotImplementedError unless @host == :cuisineaz

	# Keep harvesting links until the queue drains or we exceed the limit.
	until @to_crawl_urls.empty?
		get_links @to_crawl_urls.first
		break if @crawled_urls.count > limit
	end

	return unless block_given?
	# Scrape at most `limit` of the crawled pages and hand each recipe back.
	@crawled_urls.first(limit).each do |page_url|
		yield scrape(page_url)
	end
end

This method returns an undefined value.

Get recipes links from the given url

Parameters:

  • url (String)

    as url to scrape



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/recipe_crawler/crawler.rb', line 128

# Get recipe links from the given url and queue the new ones.
#
# Marks `url` as crawled, removes it from the queue, and appends every
# not-yet-crawled recipe link found on the page to @to_crawl_urls.
#
# @param url [String] page to fetch and scan for links
# @return [void]
def get_links url
	# catch 404 error from host
	begin
		doc = Nokogiri::HTML(open(url))
		# find internal links on page
		doc.css('#tagCloud  a').each do |link|
			link = link.attr('href')
			# Queue the link only if it is a recipe URL of the current host
			# and has not been crawled already.
			# (Fix: the previous check tested `url` instead of `link`, which
			# re-queued pages that were already crawled.)
			if link.include?(ALLOWED_URLS[@host]) && !@crawled_urls.include?(link)
				@to_crawl_urls << link
			end
		end
		@to_crawl_urls.delete url
		@crawled_urls << url
		@to_crawl_urls.uniq!

	rescue OpenURI::HTTPError
		# Unreachable page: drop it from the queue and keep crawling.
		@to_crawl_urls.delete url
		warn "#{url} cannot be reached"
	end
end

#save(recipe) ⇒ Boolean

Save recipe

Parameters:

  • recipe (RecipeSraper::Recipe)

    as recipe to save

Returns:

  • (Boolean)

    as true if success



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/recipe_crawler/crawler.rb', line 156

# Save a recipe into the SQLite database.
#
# @param recipe [RecipeSraper::Recipe] recipe to persist
# @return [Boolean] true on success, false if the insert failed
def save recipe
	begin
		# Fix: the :cooktime placeholder was previously left unbound, so the
		# cooktime column was never persisted.
		@db.execute "INSERT INTO recipes (title, preptime, cooktime, ingredients, steps, image)
				VALUES (:title, :preptime, :cooktime, :ingredients, :steps, :image)",
				title: recipe.title,
				preptime: recipe.preptime,
				cooktime: recipe.cooktime,
				ingredients: recipe.ingredients.join("\n"),
				steps: recipe.steps.join("\n"),
				image: recipe.image

		return true
		
	rescue SQLite3::Exception => e 
			puts "Exception occurred #{e}"
			return false
	end
end

#scrape(url) ⇒ RecipeSraper::Recipe

Scrape the given url into a recipe and save it.

Returns:

  • (RecipeSraper::Recipe)

    as recipe scraped



111
112
113
114
115
116
117
118
119
120
# File 'lib/recipe_crawler/crawler.rb', line 111

# Scrape the given url into a recipe, record it, and save it to the database.
#
# @param url [String] url to scrape
# @return [RecipeSraper::Recipe] the recipe scraped
# @raise [SQLite3::Exception] if the recipe could not be saved
def scrape url
	recipe = RecipeSraper::Recipe.new url
	@scraped_urls << url
	@recipes << recipe
	if save recipe
		return recipe
	else
		# Fix: error message previously read 'accnot save recipe'.
		raise SQLite3::Exception, 'cannot save recipe'
	end
end

#url_valid?Boolean

Check if the url can be parsed and set the host

Returns:

  • (Boolean)

    true if url can be parsed



65
66
67
68
69
70
71
72
73
# File 'lib/recipe_crawler/crawler.rb', line 65

# Check whether the url belongs to one of the supported hosts and, if so,
# remember which one.
#
# @return [Boolean] true if the url can be parsed (also sets @host)
def url_valid?
	matched = ALLOWED_URLS.find { |_name, prefix| url.include?(prefix) }
	return false unless matched

	@host = matched.first
	true
end