Class: RecipeCrawler::Crawler
- Inherits: Object
- Defined in: lib/recipe_crawler/crawler.rb
Overview
This is the main class. It crawls recipes from a given URL (a usage sketch follows the steps):
1. The Crawler crawls the given URL to find other recipe URLs on the website.
2. It crawls the URLs it found to discover further URLs, again and again.
3. It scrapes the URLs it found to extract recipe data.
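A minimal usage sketch, assuming the gem is installed; the recipe URL below is hypothetical:

require 'recipe_crawler'

# Crawl from a (hypothetical) cuisineaz recipe page and print each recipe found
crawler = RecipeCrawler::Crawler.new 'http://www.cuisineaz.com/recettes/tartiflette-1234.aspx'
crawler.crawl!(5) do |recipe|
  puts recipe.title
end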
Constant Summary
- ALLOWED_URLS =
  URLs that the crawler can parse.
  {
    cuisineaz: 'http://www.cuisineaz.com/recettes/',
    marmiton: 'http://www.marmiton.org/recettes/',
    g750: 'http://www.750g.com/'
  }
Instance Attribute Summary
- #crawled_urls ⇒ Array<String>
  URLs already crawled.
- #db ⇒ SQLite3::Database
  SQLite database where recipes are saved.
- #host ⇒ Symbol
  Symbol key of the URL's host.
- #recipes ⇒ Array<RecipeSraper::Recipe>
  Recipes fetched.
- #scraped_urls ⇒ Array<String>
  URLs already scraped.
- #to_crawl_urls ⇒ Array<String>
  URLs still to crawl.
- #url ⇒ String
  First URL parsed.
Instance Method Summary
- #crawl!(limit = 2) {|RecipeSraper::Recipe| ... } ⇒ Object
  Start the crawl.
- #get_links(url) ⇒ void
  Get recipe links from the given URL.
- #initialize(url) ⇒ Crawler (constructor)
  Create a Crawler.
- #save(recipe) ⇒ Boolean
  Save a recipe.
- #scrape(url) ⇒ RecipeSraper::Recipe
  Scrape the given URL.
- #url_valid? ⇒ Boolean
  Check if the URL can be parsed and set the host.
Constructor Details
#initialize(url) ⇒ Crawler
Create a Crawler
# File 'lib/recipe_crawler/crawler.rb', line 37

def initialize url
  @url = url
  if url_valid?
    @recipes = []
    @crawled_urls = []
    @scraped_urls = []
    @to_crawl_urls = []
    @to_crawl_urls << url
    @db = SQLite3::Database.new "results.sqlite3"
    @db.execute "CREATE TABLE IF NOT EXISTS recipes(
      Id INTEGER PRIMARY KEY,
      title TEXT,
      preptime INTEGER,
      cooktime INTEGER,
      ingredients TEXT,
      steps TEXT,
      image TEXT
    )"
  else
    raise ArgumentError, 'This url cannot be used'
  end
end
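The constructor raises an ArgumentError when the URL matches no ALLOWED_URLS host. A short sketch (the URL below is hypothetical):

# An unsupported host is rejected at construction time
RecipeCrawler::Crawler.new 'http://www.example.com/recettes/'
# => ArgumentError: This url cannot be used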
Instance Attribute Details
#crawled_urls ⇒ Array<String>
Returns the list of URLs already crawled.

# File 'lib/recipe_crawler/crawler.rb', line 22

def crawled_urls
  @crawled_urls
end
#db ⇒ SQLite3::Database
SQLite database where recipes are saved.

# File 'lib/recipe_crawler/crawler.rb', line 22

def db
  @db
end
#host ⇒ Symbol
Symbol key of the URL's host (e.g. :cuisineaz).

# File 'lib/recipe_crawler/crawler.rb', line 22

def host
  @host
end
#recipes ⇒ Array<RecipeSraper::Recipe>
Recipes fetched so far.

# File 'lib/recipe_crawler/crawler.rb', line 22

def recipes
  @recipes
end
#scraped_urls ⇒ Array<String>
Returns the list of URLs already scraped.

# File 'lib/recipe_crawler/crawler.rb', line 22

def scraped_urls
  @scraped_urls
end
#to_crawl_urls ⇒ Array<String>
Returns the list of URLs still to crawl.

# File 'lib/recipe_crawler/crawler.rb', line 22

def to_crawl_urls
  @to_crawl_urls
end
#url ⇒ String
The first URL parsed.

# File 'lib/recipe_crawler/crawler.rb', line 22

def url
  @url
end
Instance Method Details
#crawl!(limit = 2) {|RecipeSraper::Recipe| ... } ⇒ Object
Start the crawl. Only the :cuisineaz host is implemented; each scraped recipe is yielded if a block is given.
# File 'lib/recipe_crawler/crawler.rb', line 81

def crawl! limit = 2
  # find all links on the given url (and on the urls found)
  if @host == :cuisineaz
    while !@to_crawl_urls.empty?
      get_links to_crawl_urls[0]
      break if @crawled_urls.count > limit
    end
  else
    raise NotImplementedError
  end
  # scrape the crawled urls
  recipes_returned = 0
  @crawled_urls.each { |crawled_url|
    if limit > recipes_returned
      yield scrape crawled_url
      recipes_returned += 1
    else
      break
    end
  } if block_given?
end
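A usage sketch: yielding up to `limit` recipes from a (hypothetical) cuisineaz page; any other host raises NotImplementedError:

crawler = RecipeCrawler::Crawler.new 'http://www.cuisineaz.com/recettes/gratin-dauphinois-1234.aspx'
crawler.crawl!(3) do |recipe|
  # each yielded recipe has already been saved to results.sqlite3
  puts "#{recipe.title}: #{recipe.ingredients.count} ingredients"
end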
#get_links(url) ⇒ void
This method returns an undefined value.
Get recipe links from the given URL.
# File 'lib/recipe_crawler/crawler.rb', line 128

def get_links url
  # catch 404 errors from the host
  begin
    doc = Nokogiri::HTML(open(url))
    # find internal links on the page
    doc.css('#tagCloud a').each do |link|
      link = link.attr('href')
      # if the link corresponds to a recipe, queue it for crawling
      # (checks link, not url, so already-crawled pages are not re-queued)
      if link.include?(ALLOWED_URLS[@host]) && !@crawled_urls.include?(link)
        @to_crawl_urls << link
      end
    end
    @to_crawl_urls.delete url
    @crawled_urls << url
    @to_crawl_urls.uniq!
  rescue OpenURI::HTTPError
    @to_crawl_urls.delete url
    warn "#{url} cannot be reached"
  end
end
#save(recipe) ⇒ Boolean
Save a recipe into the SQLite database.
# File 'lib/recipe_crawler/crawler.rb', line 156

def save recipe
  begin
    @db.execute "INSERT INTO recipes (title, preptime, cooktime, ingredients, steps, image)
                 VALUES (:title, :preptime, :cooktime, :ingredients, :steps, :image)",
                title: recipe.title,
                preptime: recipe.preptime,
                # cooktime was missing although the statement binds :cooktime
                cooktime: recipe.cooktime,
                ingredients: recipe.ingredients.join("\n"),
                steps: recipe.steps.join("\n"),
                image: recipe.image
    return true
  rescue SQLite3::Exception => e
    puts "Exception occurred #{e}"
    return false
  end
end
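A short read-back sketch, assuming a results.sqlite3 file produced by the crawler:

require 'sqlite3'

# Print each saved recipe with its timing columns
db = SQLite3::Database.new 'results.sqlite3'
db.execute('SELECT title, preptime, cooktime FROM recipes') do |title, preptime, cooktime|
  puts "#{title}: #{preptime} min preparation, #{cooktime} min cooking"
end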
#scrape(url) ⇒ RecipeSraper::Recipe
Scrape the given URL. Parameter url [String] is the URL to scrape.
# File 'lib/recipe_crawler/crawler.rb', line 111

def scrape url
  recipe = RecipeSraper::Recipe.new url
  @scraped_urls << url
  @recipes << recipe
  if save recipe
    return recipe
  else
    raise SQLite3::Exception, 'cannot save recipe'
  end
end
#url_valid? ⇒ Boolean
Check if the URL can be parsed and, if so, set the host.
# File 'lib/recipe_crawler/crawler.rb', line 65

def url_valid?
  ALLOWED_URLS.each do |host, url_allowed|
    if url.include? url_allowed
      @host = host
      return true
    end
  end
  return false
end
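For illustration, a crawler built with a hypothetical marmiton URL sets the host accordingly (the recipe path is made up):

crawler = RecipeCrawler::Crawler.new 'http://www.marmiton.org/recettes/recette_tarte-aux-pommes_1234.aspx'
crawler.url_valid? # => true
crawler.host       # => :marmiton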