Module: UrlCommon

Defined in:
lib/url_common.rb,
lib/url_common/version.rb

Defined Under Namespace

Classes: Error

Constant Summary

VERSION = "0.1.6.1"

Class Method Summary

Class Method Details

.agent ⇒ Object



# File 'lib/url_common.rb', line 120

def self.agent
  return Mechanize.new
end

.check_for_404(url, elixir_style = false) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 259

def self.check_for_404(url, elixir_style = false)
  agent = Mechanize.new

  begin
    agent.head(url)
    return :ok, url if elixir_style
    return OpenStruct.new(:url => url, :status => 200)
  rescue StandardError => e
    return :error, url if elixir_style
    if e.to_s =~ /404/
      return OpenStruct.new(:url => url, :error => e, :status => 404)
    else
      # the original returned 404 here too; a non-404 failure shouldn't claim one
      return OpenStruct.new(:url => url, :error => e, :status => nil)
    end
  end
end
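
A quick usage sketch of both return styles (the url is a placeholder):

  result = UrlCommon.check_for_404("https://example.com/missing")
  result.status    # => 200, 404, or nil for non-404 failures

  status, url = UrlCommon.check_for_404("https://example.com/missing", true)
  status           # => :ok or :error (elixir style)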

.check_for_amazon_referrer(url, referrer_code) ⇒ Object

Tested against: www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0. Returns nil if the link isn't an Amazon link at all; true if the link is Amazon and has the referrer code; false if the link is Amazon and doesn't have the referrer code.



# File 'lib/url_common.rb', line 160

def self.check_for_amazon_referrer(url, referrer_code)
  # https://github.com/gamache/fuzzyurl.rb
  fu = Fuzzyurl.from_string(url)
  return nil if fu.hostname.nil?
  base_domain = fu.hostname.sub(/^www\./, '')
  parts = base_domain.split(".")
  return nil if parts[0] != "amazon"
  if url =~ /#{referrer_code}/
    return true
  else
    return false
  end
end
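
A sketch of the three possible returns (urls and the tag are hypothetical):

  UrlCommon.check_for_amazon_referrer("https://example.com/x", "mytag-20")
  # => nil -- not an amazon link at all
  UrlCommon.check_for_amazon_referrer("https://www.amazon.com/dp/B01DT4A2R4?tag=mytag-20", "mytag-20")
  # => true -- amazon link carrying the referrer code
  UrlCommon.check_for_amazon_referrer("https://www.amazon.com/dp/B01DT4A2R4", "mytag-20")
  # => false -- amazon link without it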

.check_for_broken_links(links) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 277

def self.check_for_broken_links(links)
  results = []
  agent = Mechanize.new
  links.each do |link|
    begin
      result = agent.head(link.href)
      results << OpenStruct.new(:url => link.href, :status => 200)
    rescue StandardError => e
      if e.to_s =~ /404/
        results << OpenStruct.new(:url => link.href, :error => e, :status => 404)
      end
    end
  end
  results
end
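
A usage sketch, assuming links respond to #href the way the Mechanize link
objects returned by .extract_links_from_text do:

  links = UrlCommon.extract_links_from_text(some_html)
  broken = UrlCommon.check_for_broken_links(links).select { |r| r.status == 404 }
  broken.each { |r| puts r.url }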


.count_links(html) ⇒ Object

# File 'lib/url_common.rb', line 105

def self.count_links(html)
  if html =~ /<html/i
    content_type = "html"
  else
    content_type = "ascii"
  end
  # anchor tags contain spaces, so a space-split token could never match the
  # original <a ...>...</a> pattern; count anchors on the whole string instead
  return html.scan(/<a [^>]+>.*?<\/a>/im).size if content_type == 'html'
  link_ctr = 0
  html.split(" ").each do |part|
    link_ctr += 1 if part =~ /https?:\/\// # was https:?, which missed http:// urls
  end
  link_ctr
end
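
For example, exercising both branches:

  UrlCommon.count_links("see https://a.example and https://b.example")
  # => 2 (ascii branch: whitespace-delimited urls)
  UrlCommon.count_links('<html><a href="https://a.example">a</a></html>')
  # => 1 (html branch: anchor tags)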

.create_mechanize_page_from_html(url, html) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 325

def self.create_mechanize_page_from_html(url, html)
  mechanize_page = Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html, nil, Mechanize.new)
  url = url.gsub(/ /, '%20')
  mechanize_page.uri = URI.parse(url)

  return mechanize_page
end
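
A usage sketch (the url and html are placeholders):

  html = "<html><head><title>Hi</title></head><body></body></html>"
  page = UrlCommon.create_mechanize_page_from_html("https://example.com/a page", html)
  page.uri.to_s  # => "https://example.com/a%20page"
  page.title     # => "Hi"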

.create_permalink(tbl, fld, url) ⇒ Object

TODO: expand on this by checking to make sure it doesn't clash if called again; should be fine but ttfn.



# File 'lib/url_common.rb', line 580

def self.create_permalink(tbl, fld, url)
  # NOTE: appears unfinished -- short_url is computed but unused, and only
  # the sanitized url is (implicitly) returned
  short_url = UrlCommon.generate_short_url
  sanitized_url = UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
end

.discover_feed_url(site_url, debug = false) ⇒ Object

Test cases: 500hats.com/feed and UrlCommon.discover_feed_url("nickjanetakis.com")



# File 'lib/url_common.rb', line 474

def self.discover_feed_url(site_url, debug = false)
  # step 1: remove the file from the site_url if it has one
  # step 2: probe the common ones and 404 check
  
  #
  # Build a set of possibles
  #
  possible_rssurls = UrlCommon.possible_rssurls(site_url)
  
  #
  # Keep track of failures
  #
  failed_probes = Set.new
  
  # step 3: parse the html
  #<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
  #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
  #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
  
  #
  # Stage 1 -- do http head probing
  #
  possible_rssurls.each do |rssurl|
    puts "Head Probing for: #{rssurl}" if debug
    
    # abort if we doubled blog i.e. /blog/blog/ in the url
    next if rssurl =~ /blog\/blog/
    next if failed_probes.include?(rssurl)
    
    status, url = UrlCommon.check_for_404(rssurl, true)
    random_status, random_url = UrlCommon.test_random_url(site_url)
    return rssurl if status == :ok && random_status == :ok
    failed_probes << rssurl
  end
  
  puts "After probe, failed_probes as: #{failed_probes.inspect}"
  
  #
  # Stage 2-- if subdirectory go up one level and probe again
  #
  # TODO
  
  
  
  #
  # Stage 3 -- Goto root and probe again 
  #
  #test for this is the nick site
  fuzzy_url_parts = Fuzzyurl.new(site_url)
  base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
  possible_rssurls = UrlCommon.possible_rssurls(base_url)
  possible_rssurls.each do |rssurl|
    puts "Head Probing for: #{rssurl} at site root stage" if debug
    
    # abort if we doubled blog i.e. /blog/blog/ in the url
    next if rssurl =~ /blog\/blog/
    next if failed_probes.include?(rssurl)
    
    status, url = UrlCommon.check_for_404(rssurl, true)    
    return rssurl if status == :ok
    failed_probes << rssurl
  end
  
  
  #
  # Stage 4 - parse the html
  #
  rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
  return rssurl if rssurl
  
  #
  # Stage 5 -- fall back to Feedbag
  #
  results = Feedbag.find(site_url)
  # checked_results = []
  # results.each do |result|
  #   struct = UrlCommon.check_for_404(result)
  #   checked_results << result if struct.status == 200
  # end
  
  #
  # Stage 6 - cache failures to redis so don't look for them again
  #
  #$redis.
  
  return UrlCommon.select_best_rssurl_from_rssurls(results)
end
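
A usage sketch (ma.tt appears in the comments above; any blog url works):

  rssurl = UrlCommon.discover_feed_url("https://ma.tt/", true)
  # head-probes feed.xml / rss.xml / atom.xml / feed/, retries at the site
  # root, then falls back to <link rel="alternate"> parsing and Feedbag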


.extract_links_from_text(text) ⇒ Object

# File 'lib/url_common.rb', line 356

def self.extract_links_from_text(text)
  agent = Mechanize.new
  html = "<HTML><BODY>#{text}</BODY></HTML>"
  page = Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html, nil, agent)
  return page.links
end
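
For example:

  links = UrlCommon.extract_links_from_text('check out <a href="https://a.example/">this</a>')
  links.first.href  # => "https://a.example/"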

.fix_relative_url(base_url, partial_url) ⇒ Object



# File 'lib/url_common.rb', line 294

def self.fix_relative_url(base_url, partial_url)
  return partial_url if partial_url =~ /^http/
  parts = URI.parse(base_url)
  # note: partial_url is expected to start with '/'; an unreachable
  # File.join fallback was removed here
  return parts.scheme + '://' + parts.host + partial_url
end
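
For example (urls are placeholders):

  UrlCommon.fix_relative_url("https://example.com/blog/post", "/about")
  # => "https://example.com/about"
  UrlCommon.fix_relative_url("https://example.com/", "https://other.example/x")
  # => "https://other.example/x" -- already absolute, passed through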

.generate_fid ⇒ Object

Based on this approach: medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d (earlier names: def self.generate_short_fid, def self.create_fid)



# File 'lib/url_common.rb', line 567

def self.generate_fid
  rand(36**8).to_s(36)
end

.get_base_domain(url) ⇒ Object

Designed to get just the domain without the WWW



# File 'lib/url_common.rb', line 62

def self.get_base_domain(url)
  begin
    url = url.gsub(/ /, '%20')
    parts = URI.parse(url)
    return parts.host.gsub(/^www\./, '')
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    return fu.hostname.gsub(/^www\./, '')
  end
end
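
For example:

  UrlCommon.get_base_domain("https://www.udemy.com/courses/")  # => "udemy.com"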

.get_meta_description(url, html) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 334

def self.get_meta_description(url, html)
  page = UrlCommon.create_mechanize_page_from_html(url, html)
  description = ""
  begin
    description = page.parser.at("meta[name='description']")['content']
  rescue StandardError => e
    # no meta description tag present; fall through with ""
  end
  return description
end

.get_page(url, return_html = false, user_agent = nil) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 194

def self.get_page(url, return_html = false, user_agent = nil)
  agent = Mechanize.new { |a|
    if user_agent.nil?
      #a.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:46.0) Gecko/20100101 Firefox/46.0"
      a.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
    else
      a.user_agent = user_agent
    end
    #a.user_agent = "curl/7.54.0"
  }
  agent.verify_callback = Proc.new do |ok, x509|
    status = x509.error
    msg = x509.error_string
    # Kernel#warn -- the original called logger.warn, but no logger is
    # defined in module scope
    warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
    true # returning true has the side effect of ignoring certificate errors
  end
  begin
    page = agent.get(url)
    if return_html
      return :ok, page.body
    else
      return :ok, page
    end
  rescue StandardError => e
    return :error, e
  end
end
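
A usage sketch of the tuple return (url is a placeholder):

  status, page = UrlCommon.get_page("https://example.com/")
  puts page.title if status == :ok

  status, html = UrlCommon.get_page("https://example.com/", true)
  # html is the raw body string when return_html is true; on failure the
  # second element is the exception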

.get_page_title(url, html) ⇒ Object

TODO needs tests. Example: UrlCommon.get_page_title("gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")



# File 'lib/url_common.rb', line 346

def self.get_page_title(url, html)
  page = UrlCommon.create_mechanize_page_from_html(url, html)
  title = ""
  begin
    title = page.parser.css('title').first.content
  rescue StandardError => e
    # no title tag present; fall through with ""
  end
  return title
end

.get_protocol(url) ⇒ Object



# File 'lib/url_common.rb', line 467

def self.get_protocol(url)
  parts = url.to_s.split(":")
  return parts.first
end

.get_root_domain(url) ⇒ Object

Designed to get the root of the domain, i.e. elias.slack.com => slack.com



# File 'lib/url_common.rb', line 75

def self.get_root_domain(url)
  begin
    url = url.gsub(/ /,'%20')
    url_parts = URI.parse(url)
    host_parts = url_parts.host.split('.')
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    host_parts = fu.hostname.split('.')
  end
  return host_parts[0] if host_parts.size == 1
  return "#{host_parts[host_parts.size - 2]}.#{host_parts.last}"
end
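
For example, per the description above:

  UrlCommon.get_root_domain("https://elias.slack.com/archives")  # => "slack.com"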

.has_own_domain?(url) ⇒ Boolean

TODO needs tests

(A commented-out earlier name survives in the source: def self.check_for_jekyll_subdomain?(url))

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 179

def self.has_own_domain?(url)
  return false if url =~ /\.github\.io/
  return false if url =~ /\.blogspot\.com/
  return false if url =~ /\.wordpress\.com/
  # an unreachable block referencing undefined site_url / analysis_results
  # was removed here
  return true
end

.is_valid?(url) ⇒ Boolean

UrlCommon.is_valid?("fuzzyblog.io/blog/")

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 11

def self.is_valid?(url)
  begin
    result = Fuzzyurl.from_string(url)
    return false if result.hostname.nil?
    return false if result.protocol.nil?
    # a third check (hostname without '.' AND nil protocol) was dead code:
    # it could never fire after the protocol check above
    return true
  rescue StandardError => e
    return false
  end
end
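
For example (the second call mirrors the description above):

  UrlCommon.is_valid?("https://fuzzyblog.io/blog/")  # => true
  UrlCommon.is_valid?("fuzzyblog.io/blog/")          # => false -- no protocol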

.join(base, rest, debug = false) ⇒ Object



# File 'lib/url_common.rb', line 88

def self.join(base, rest, debug = false)
  return URI.join(base, rest).to_s
end

.mpage_is_html?(page) ⇒ Boolean

(A commented-out earlier caching attempt is preserved in the source:)

def self.get_page_caching_attempt(url, return_html = false)

agent = Mechanize.new { |a|
  a.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:46.0) Gecko/20100101 Firefox/46.0"
}
agent.verify_callback = Proc.new do |ok,x509|
  status = x509.error
  msg = x509.error_string
  logger.warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
  true # this has the side effect of ignoring errors. nice!
end
begin
  page = agent.get(url)
  if return_html
    Rails.cache.fetch(UrlCommon.sha_it(url), :expires_in => 1.hour) do
      page.body
    end
    return :ok, page.body
  else
    return :ok, page
  end
rescue StandardError => e
  return :error, e
end

end

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 253

def self.mpage_is_html?(page)
  return true if page.respond_to?(:title)
  return false
end

.parse_country_from_itunes_url(url) ⇒ Object



# File 'lib/url_common.rb', line 44

def self.parse_country_from_itunes_url(url)
  country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
  if country
    country = country[1] 
  end
  return country if country
  return 'us'
end

.parse_fid_from_amazon_url(url) ⇒ Object



# File 'lib/url_common.rb', line 34

def self.parse_fid_from_amazon_url(url)
  tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
  if tmp && tmp[1]
    return tmp[1] 
  else
    return nil
  end
end
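
For example (the ASIN is taken from the referrer-check docs above):

  UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/dp/B01DT4A2R4")
  # => "B01DT4A2R4"
  UrlCommon.parse_fid_from_amazon_url("https://example.com/")
  # => nil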

.parse_fid_from_itunes_url(url) ⇒ Object

UrlCommon.parse_fid_from_itunes_url("itunes.apple.com/us/app/imovie/id408981434?mt=12")



# File 'lib/url_common.rb', line 24

def self.parse_fid_from_itunes_url(url)
  tmp = /\/id([0-9]+)/.match(url)
  if tmp && tmp[1]
    return tmp[1] 
  else
    return nil
  end
end

.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false) ⇒ Object



# File 'lib/url_common.rb', line 420

def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
  if page
    status = :ok
  else
    status, page = UrlCommon.get_page(site_url)
  end
  puts "Into html parse for rssurl" if debug
  possibles = []
  if status == :ok && page
    #results = page.css("link[rel='alternate']")
    results = page.css("link[rel='alternate'][type='application/rss+xml']")
    #
    # If only a single one then return it
    #
    #return results.first['href'] if results.first['type'] =~ /application\/rss\+xml/i && results.size == 1
    return results.first['href'] if results.size == 1
    
    #
    # If an array then filter out the comments
    #
    results.each do |result|
      possibles << result unless result['title'] =~ /comments? feed/i
    end
    
    #
    # Loop over the possibles and just return the shortest url
    #
    # Todo -- can likely do a better job on this
    #
    urls = []
    possibles.each do |possible|
      urls << possible['href']
    end
    return UrlCommon.select_best_rssurl_from_rssurls(urls)
  end
end

.possible_rssurls(site_url, skip_slash_blog = false) ⇒ Object



# File 'lib/url_common.rb', line 396

def self.possible_rssurls(site_url, skip_slash_blog = false)
  # urls we will probe
  possible_rssurl_formats = []

  # normal baselines
  possible_rssurl_formats << "feed.xml"
  possible_rssurl_formats << "rss.xml"
  possible_rssurl_formats << "atom.xml"
  possible_rssurl_formats << "feed/"

  # optionally look at /blog/ (the skip_slash_blog flag was accepted but
  # never honored; gate the /blog/ probes on it)
  unless skip_slash_blog
    possible_rssurl_formats << "/blog/feed.xml"
    possible_rssurl_formats << "/blog/rss.xml"
    possible_rssurl_formats << "/blog/atom.xml"
    possible_rssurl_formats << "/blog/feed/"
  end

  possible_rssurls = []
  possible_rssurl_formats.each do |url_format|
    possible_rssurls << UrlCommon.join(site_url, url_format)
  end

  return possible_rssurls
end
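
For example (site url is a placeholder):

  UrlCommon.possible_rssurls("https://example.com/")
  # => ["https://example.com/feed.xml", "https://example.com/rss.xml",
  #     "https://example.com/atom.xml", "https://example.com/feed/",
  #     "https://example.com/blog/feed.xml", ...]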

.sanitize(long_url, url_base) ⇒ Object



# File 'lib/url_common.rb', line 571

def self.sanitize(long_url, url_base)
  long_url.strip!
  # was self.long_url, which would raise NoMethodError; long_url is the argument
  sanitized_url = long_url.downcase.gsub(/(https?:\/\/)|(www\.)/, "")
  url_base + sanitized_url
end

.select_best_rssurl_from_rssurls(urls) ⇒ Object



# File 'lib/url_common.rb', line 392

def self.select_best_rssurl_from_rssurls(urls)
  return urls.sort_by(&:length).first
end

.strip_a_tag(a_tag) ⇒ Object



# File 'lib/url_common.rb', line 124

def self.strip_a_tag(a_tag)
  #<a href="https://www.keyingredient.com/recipes/12194051/egg-salad-best-ever-creamy/">
  return a_tag.sub(/<a href=[\"']/,'').sub(/[\"']>/,'')
end

.summarize_url(url) ⇒ Object



# File 'lib/url_common.rb', line 364

def self.summarize_url(url)
  #GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
  # NOTE: appears unfinished -- summarization_url is never used and the
  # method just fetches the page itself
  agent = Mechanize.new
  summarization_url = ""
  page = agent.get(url)
end

.test_random_url(url_or_host) ⇒ Object

A pathological test case for this is devslopes.com/ (the site never returns a 404).



# File 'lib/url_common.rb', line 372

def self.test_random_url(url_or_host)
  random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
  if url_or_host =~ /http/
    url = File.join(url_or_host, random_filename)
  else
    # was File.join("http://", host, ...); host was undefined
    url = File.join("http://", url_or_host, random_filename)
  end
  status, url = UrlCommon.check_for_404(url, true)
  #
  # Key bit of logic -- if a request for a random sha-based filename
  # succeeds, the destination site is configured so that it NEVER returns
  # a 404 (everything lands on the home page). So flip the logic: a 200
  # here is an error, and a 404 is the healthy response.
  #
  return :error, url if status == :ok
  return :ok, url
end
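
A usage sketch of the flipped status:

  status, url = UrlCommon.test_random_url("https://example.com")
  # :ok    -- the random url 404'd, so the site's 404 handling is sane
  # :error -- the site answered 200 for a random path, so 404 probes
  #           against it can't be trusted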

.url_base(url, base_domain = nil) ⇒ Object

Returns a url w/o www. UrlCommon.url_base("www.udemy.com/the-build-a-saas-app-with-flask-course/") => "udemy.com/the-build-a-saas-app-with-flask-course/"



# File 'lib/url_common.rb', line 135

def self.url_base(url, base_domain=nil)
  if base_domain.nil?
    base_domain = UrlCommon.get_base_domain(url)
  end
  begin
    url = url.gsub(/ /,'%20')      
    parts = URI.parse(url)
    extra = ""
    extra = "?#{parts.query}" if parts.query
    url_base = "#{base_domain}#{parts.path}#{extra}"
    return url_base[0..254]
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    base_domain = UrlCommon.get_base_domain(url)
    extra = ""
    extra = "?#{fu.query}" if fu.query
    url_base = "#{base_domain}#{fu.path}#{extra}"
    return url_base[0..254]
  end
end
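
For example, per the description above:

  UrlCommon.url_base("https://www.udemy.com/the-build-a-saas-app-with-flask-course/")
  # => "udemy.com/the-build-a-saas-app-with-flask-course/"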

.url_no_www(url) ⇒ Object

UrlCommon.url_no_www("www.youtube.com/watch?v=4Mj_zDw21xY")



# File 'lib/url_common.rb', line 93

def self.url_no_www(url)
  parts = Fuzzyurl.new(url)
  if parts.query
    # .to_s guards against a nil path, which would raise on string concat
    return parts.hostname.sub(/^www\./, '') + parts.path.to_s + '?' + parts.query
  else
    return parts.hostname.sub(/^www\./, '') + parts.path.to_s
  end
end

.validate_with_merge_fragment(url, merge_fragment) ⇒ Object

status, url = UrlCommon.validate_with_merge_fragment("nickjj/orats", "www.github.com/")



# File 'lib/url_common.rb', line 302

def self.validate_with_merge_fragment(url, merge_fragment)
  #
  # verify it is a valid url and it isn't a 404 or redirect
  #
  # check_for_404 returns an OpenStruct, which is always truthy, so test
  # its status explicitly (the original bare check always passed)
  if is_valid?(url) && check_for_404(url).status == 200
    return true, url
  end

  #
  # Try and make it valid
  #
  if url =~ /^http/
    # if it's invalid and has http then we don't know what to do, so return false
    return false, url
  end

  url = File.join(merge_fragment, url)
  if is_valid?(url) && check_for_404(url).status == 200
    return true, url
  end
  return false, url
end
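
A usage sketch, following the example in the description (protocol added
so the merged url validates):

  status, url = UrlCommon.validate_with_merge_fragment("nickjj/orats", "https://www.github.com/")
  # => [true, "https://www.github.com/nickjj/orats"] if the merged url
  #    resolves; [false, url] otherwise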