Module: UrlCommon

Defined in:
lib/url_common.rb,
lib/url_common/version.rb

Defined Under Namespace

Classes: Error

Constant Summary

VERSION = "0.1.6.3"

Class Method Summary

Class Method Details

.agent ⇒ Object



# File 'lib/url_common.rb', line 125

def self.agent
  return Mechanize.new
end

.check_for_404(url, elixir_style = false) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 264

def self.check_for_404(url, elixir_style = false)
  agent = Mechanize.new
  begin
    agent.head(url)
    return :ok, url if elixir_style
    return OpenStruct.new(:url => url, :status => 200)
  rescue StandardError => e
    # note: any raised error is reported as a 404 here, not just real 404s
    return :error, url if elixir_style
    return OpenStruct.new(:url => url, :error => e, :status => 404)
  end
end
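
A usage sketch (illustrative; the URLs are placeholders and the return values assume the responses noted):

# default struct style
result = UrlCommon.check_for_404("https://example.com/")
result.status                                   # => 200 if the HEAD succeeds

# elixir style: a [status, url] pair
status, url = UrlCommon.check_for_404("https://example.com/missing", true)
# status == :ok on success, :error on any raised error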

.check_for_amazon_referrer(url, referrer_code) ⇒ Object

Tested with: www.amazon.com/gp/product/B01DT4A2R4/ref=as_li_qf_sp_asin_il_tl?ie=UTF8&tag=nickjanetakis-20&camp=1789&creative=9325&linkCode=as2&creativeASIN=B01DT4A2R4&linkId=496be5e222b6291369c0a393c797c2c0. Returns nil if the link isn't Amazon at all, true if the link is Amazon and has the referrer code, and false if the link is Amazon and doesn't have the referrer code.



# File 'lib/url_common.rb', line 165

def self.check_for_amazon_referrer(url, referrer_code)
  # https://github.com/gamache/fuzzyurl.rb
  fu = Fuzzyurl.from_string(url)
  return nil if fu.hostname.nil?
  base_domain = fu.hostname.sub(/^www\./, '')
  parts = base_domain.split(".")
  return nil if parts[0] != "amazon"
  # escape the code in case it contains regex metacharacters
  if url =~ /#{Regexp.escape(referrer_code)}/
    return true
  else
    return false
  end
end
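
Illustrative calls (hedged; the first URL is shortened from the tested example above and the referrer codes are placeholders):

url = "https://www.amazon.com/gp/product/B01DT4A2R4?tag=nickjanetakis-20"
UrlCommon.check_for_amazon_referrer(url, "nickjanetakis-20")             # => true
UrlCommon.check_for_amazon_referrer(url, "someoneelse-20")               # => false
UrlCommon.check_for_amazon_referrer("https://example.com/", "any-code")  # => nil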

.check_for_broken_links(links) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 282

def self.check_for_broken_links(links)
  results = []
  agent = Mechanize.new
  links.each do |link|
    begin
      agent.head(link.href)
      results << OpenStruct.new(:url => link.href, :status => 200)
    rescue StandardError => e
      if e.to_s =~ /404/
        results << OpenStruct.new(:url => link.href, :error => e, :status => 404)
      end
    end
  end
  results
end
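
A sketch of scanning a fetched page's links (hypothetical URL; the links only need to respond to #href):

agent = Mechanize.new
page = agent.get("https://example.com/")
results = UrlCommon.check_for_broken_links(page.links)
results.select { |r| r.status == 404 }.each { |r| puts r.url }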


.count_links(html) ⇒ Object

Counts links in a chunk of HTML or plain text.

# File 'lib/url_common.rb', line 110

def self.count_links(html)
  if html =~ /<html/i
    # html: count anchor tags across the whole document; token-by-token
    # matching breaks on the space inside "<a href", so scan the full string
    return html.scan(/<a [^>]*>.*?<\/a>/im).size
  end
  # plain text: count whitespace-delimited tokens containing an http(s) url
  html.split(" ").count { |part| part =~ /https?:\/\// }
end
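
Illustrative counts (assuming the fixed regexes above):

UrlCommon.count_links("see https://example.com and https://example.org")     # => 2
UrlCommon.count_links('<html><a href="/a">a</a> <a href="/b">b</a></html>')  # => 2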

.create_mechanize_page_from_html(url, html) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 330

def self.create_mechanize_page_from_html(url, html)
  mechanize_page = Mechanize::Page.new(nil, {'content-type'=>'text/html'}, html, nil, Mechanize.new)
  url = url.gsub(/ /,'%20')
  mechanize_page.uri = URI.parse(url)    
  
  return mechanize_page
end
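
A sketch of wrapping already-fetched HTML so the Mechanize page helpers work (hypothetical URL and HTML):

html = "<html><head><title>Hi</title></head><body><a href='/x'>x</a></body></html>"
page = UrlCommon.create_mechanize_page_from_html("https://example.com/some page", html)
page.uri.to_s    # => "https://example.com/some%20page"
page.links.size  # => 1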

.create_permalink(tbl, fld, url) ⇒ Object

TODO: expand on this by checking to make sure it doesn't need to be called again; should be fine for now.



# File 'lib/url_common.rb', line 585

def self.create_permalink(tbl, fld, url)
  # incomplete: generates the pieces of a permalink but does not yet persist them
  short_url = UrlCommon.generate_short_url
  sanitized_url = UrlCommon.sanitize(url, "https://pullquotes.io/pull_quotes/")
end

.discover_feed_url(site_url, debug = false) ⇒ Object

Discovers a site's feed url, e.g. 500hats.com/feed. Example: UrlCommon.discover_feed_url("nickjanetakis.com")



# File 'lib/url_common.rb', line 479

def self.discover_feed_url(site_url, debug = false)
  # step 1: remove the file from the site_url if it has one
  # step 2: probe the common ones and 404 check
  
  #
  # Build a set of possibles
  #
  possible_rssurls = UrlCommon.possible_rssurls(site_url)
  
  #
  # Keep track of failures
  #
  failed_probes = Set.new
  
  # step 3: parse the html
  #<link rel="alternate" type="application/rss+xml" href="http://scripting.com/rss.xml" />
  #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Feed" href="https://ma.tt/feed/" />
  #<link rel="alternate" type="application/rss+xml" title="Matt Mullenweg &raquo; Comments Feed" href="https://ma.tt/comments/feed/" />
  
  #
  # Stage 1 -- do http head probing
  #
  possible_rssurls.each do |rssurl|
    puts "Head Probing for: #{rssurl}" if debug
    
    # abort if we doubled blog i.e. /blog/blog/ in the url
    next if rssurl =~ /blog\/blog/
    next if failed_probes.include?(rssurl)
    
    status, url = UrlCommon.check_for_404(rssurl, true)    
    random_status, random_url = UrlCommon.test_random_url(site_url)
    return rssurl if status == :ok && random_status == :ok
    failed_probes << rssurl
  end
  
  puts "After probe, failed_probes as: #{failed_probes.inspect}"
  
  #
  # Stage 2-- if subdirectory go up one level and probe again
  #
  # TODO
  
  
  
  #
  # Stage 3 -- Goto root and probe again 
  #
  #test for this is the nick site
  fuzzy_url_parts = Fuzzyurl.new(site_url)
  base_url = "#{fuzzy_url_parts.protocol}://#{fuzzy_url_parts.hostname}"
  possible_rssurls = UrlCommon.possible_rssurls(base_url)
  possible_rssurls.each do |rssurl|
    puts "Head Probing for: #{rssurl} at site root stage" #if debug
    
    # abort if we doubled blog i.e. /blog/blog/ in the url
    next if rssurl =~ /blog\/blog/
    next if failed_probes.include?(rssurl)
    
    status, url = UrlCommon.check_for_404(rssurl, true)    
    return rssurl if status == :ok
    failed_probes << rssurl
  end
  
  
  #
  # Stage 4 - parse the html
  #
  rssurl = UrlCommon.parse_html_for_rssurl_from_head(site_url, nil, true)
  return rssurl if rssurl
  
  #
  # Stage 5 -- fall back to Feedbag
  #
  results = Feedbag.find(site_url)
  # checked_results = []
  # results.each do |result|
  #   struct = UrlCommon.check_for_404(result)
  #   checked_results << result if struct.status == 200
  # end
  
  #
  # Stage 6 -- TODO: cache failures to redis so we don't look for them again
  #
  #$redis.
  
  return UrlCommon.select_best_rssurl_from_rssurls(results)
end
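
Illustrative usage (hedged; the return value depends on which stage succeeds for the site):

feed = UrlCommon.discover_feed_url("https://ma.tt/")
# => e.g. "https://ma.tt/feed/" if a HEAD probe or the <link rel="alternate"> parse finds it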


.extract_links_from_text(text) ⇒ Object

# File 'lib/url_common.rb', line 361

def self.extract_links_from_text(text)
  agent = Mechanize.new
  html = "<HTML><BODY>#{text}</BODY></HTML>"
  page = Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html, nil, agent)
  return page.links
end
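
A sketch (hypothetical text):

links = UrlCommon.extract_links_from_text('check out <a href="https://example.com">this</a>')
links.first.href  # => "https://example.com"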

.fix_relative_url(base_url, partial_url) ⇒ Object



# File 'lib/url_common.rb', line 299

def self.fix_relative_url(base_url, partial_url)
  return partial_url if partial_url =~ /^http/
  parts = URI.parse(base_url)
  # note: assumes partial_url is root-relative (starts with /)
  return parts.scheme + '://' + parts.host + partial_url
end
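
Illustrative calls (hypothetical URLs; assumes a root-relative partial_url, per the note in the code):

UrlCommon.fix_relative_url("https://example.com/blog/post", "/images/a.png")
# => "https://example.com/images/a.png"
UrlCommon.fix_relative_url("https://example.com/", "https://other.com/x")
# => "https://other.com/x" (already absolute, returned unchanged)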

.generate_fid ⇒ Object

Based on this approach: medium.com/@sparkboldstudio/building-a-url-shortener-rails-app-96db60d3bf9d. Earlier names for this method were generate_short_fid and create_fid.



# File 'lib/url_common.rb', line 572

def self.generate_fid
  rand(36**8).to_s(36)
end
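
Illustrative output (random, so the value shown is only a shape):

UrlCommon.generate_fid  # => e.g. "l2qxkzn4" (a base-36 string of up to 8 characters)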

.get_base_domain(url) ⇒ Object

Designed to get just the domain without the www.



# File 'lib/url_common.rb', line 62

def self.get_base_domain(url)
  begin
    url = url.gsub(/ /, '%20')
    parts = URI.parse(url)
    return parts.host.sub(/^www\./, '')
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    return fu.hostname.sub(/^www\./, '')
  end
end
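
Illustrative call (hypothetical URL):

UrlCommon.get_base_domain("https://www.udemy.com/some-course/")  # => "udemy.com"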

.get_meta_description(url, html) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 339

def self.get_meta_description(url, html)
  page = UrlCommon.create_mechanize_page_from_html(url, html)
  description = ""
  begin
    description = page.parser.at("meta[name='description']")['content']
  rescue StandardError => e
    # no meta description tag; fall through and return ""
  end
  return description
end
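
Illustrative call (hypothetical HTML):

html = "<html><head><meta name='description' content='A demo page'></head></html>"
UrlCommon.get_meta_description("https://example.com/", html)  # => "A demo page"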

.get_page(url, return_html = false, user_agent = nil) ⇒ Object

TODO needs tests



# File 'lib/url_common.rb', line 199

def self.get_page(url, return_html = false, user_agent = nil)
  agent = Mechanize.new { |a|
    if user_agent.nil?
      a.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
    else
      a.user_agent = user_agent
    end
  }
  agent.verify_callback = Proc.new do |ok, x509|
    status = x509.error
    msg = x509.error_string
    logger.warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
    true # this has the side effect of ignoring certificate errors
  end
  begin
    page = agent.get(url)
    if return_html
      return :ok, page.body
    else
      return :ok, page
    end
  rescue StandardError => e
    return :error, e
  end
end
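
Illustrative usage (hypothetical URL):

status, page = UrlCommon.get_page("https://example.com/")
status, html = UrlCommon.get_page("https://example.com/", true)  # html is page.body
# on failure: status == :error and the second value is the exception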

.get_page_title(url, html) ⇒ Object

TODO needs tests. Example: UrlCommon.get_page_title("gist.github.com/fuzzygroup/811a9334b1a6dc394de74a23cb7e12fa")



# File 'lib/url_common.rb', line 351

def self.get_page_title(url, html)
  page = UrlCommon.create_mechanize_page_from_html(url, html)
  title = ""
  begin
    title = page.parser.css('title').first.content
  rescue StandardError => e
    # no title tag; fall through and return ""
  end
  return title
end
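
Illustrative call (hypothetical HTML):

html = "<html><head><title>My Post</title></head><body></body></html>"
UrlCommon.get_page_title("https://example.com/post", html)  # => "My Post"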

.get_protocol(url) ⇒ Object



# File 'lib/url_common.rb', line 472

def self.get_protocol(url)
  parts = url.to_s.split(":")
  return parts.first
end
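
Illustrative call:

UrlCommon.get_protocol("https://example.com/")  # => "https"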

.get_root_domain(url) ⇒ Object

Designed to get the root of the domain, i.e. elias.slack.com => slack.com.



# File 'lib/url_common.rb', line 75

def self.get_root_domain(url)
  begin
    url = url.gsub(/ /,'%20')
    url_parts = URI.parse(url)
    host_parts = url_parts.host.split('.')
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    host_parts = fu.hostname.split('.')
  end
  return host_parts[0] if host_parts.size == 1
  return "#{host_parts[host_parts.size - 2]}.#{host_parts.last}"
end
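
Illustrative call (using the example from the description):

UrlCommon.get_root_domain("https://elias.slack.com/archives")  # => "slack.com"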

.has_own_domain?(url) ⇒ Boolean

TODO needs tests

An earlier name considered for this method was check_for_jekyll_subdomain?(url).

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 184

def self.has_own_domain?(url)
  return false if url =~ /\.github\.io/
  return false if url =~ /\.blogspot\.com/
  return false if url =~ /\.wordpress\.com/
  return true
end

.is_valid?(url) ⇒ Boolean

UrlCommon.is_valid?("fuzzyblog.io/blog/")

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 11

def self.is_valid?(url)
  begin
    result = Fuzzyurl.from_string(url)
    return false if result.hostname.nil?
    return false if result.protocol.nil?
    return true
  rescue StandardError => e
    return false
  end
end
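
Illustrative calls (assuming Fuzzyurl parses a bare hostname with a nil protocol):

UrlCommon.is_valid?("https://fuzzyblog.io/blog/")  # => true
UrlCommon.is_valid?("fuzzyblog.io/blog/")          # => false (no protocol)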

.join(base, rest, debug = false) ⇒ Object



# File 'lib/url_common.rb', line 88

def self.join(base, rest, debug = false)
  return URI.join(base, rest).to_s
end
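
Illustrative call (hypothetical URLs):

UrlCommon.join("https://example.com/blog/", "feed.xml")  # => "https://example.com/blog/feed.xml"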

.mpage_is_html?(page) ⇒ Boolean

Note: the source also carries a commented-out caching variant of get_page, reproduced here as it appears in the code:

def self.get_page_caching_attempt(url, return_html = false)
  agent = Mechanize.new { |a|
    a.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:46.0) Gecko/20100101 Firefox/46.0"
  }
  agent.verify_callback = Proc.new do |ok, x509|
    status = x509.error
    msg = x509.error_string
    logger.warn "server certificate verify: status: #{status}, msg: #{msg}" if status != 0
    true # this has the side effect of ignoring certificate errors
  end
  begin
    page = agent.get(url)
    if return_html
      Rails.cache.fetch(UrlCommon.sha_it(url), :expires_in => 1.hour) do
        page.body
      end
      return :ok, page.body
    else
      return :ok, page
    end
  rescue StandardError => e
    return :error, e
  end
end

Returns:

  • (Boolean)


# File 'lib/url_common.rb', line 258

def self.mpage_is_html?(page)
  return true if page.respond_to?(:title)
  return false
end

.parse_country_from_itunes_url(url) ⇒ Object



# File 'lib/url_common.rb', line 44

def self.parse_country_from_itunes_url(url)
  country = /https?:\/\/itunes\.apple\.com\/(..)\//.match(url)
  if country
    country = country[1] 
  end
  return country if country
  return 'us'
end
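
Illustrative calls (hypothetical URLs):

UrlCommon.parse_country_from_itunes_url("https://itunes.apple.com/de/app/imovie/id408981434")  # => "de"
UrlCommon.parse_country_from_itunes_url("https://itunes.apple.com/app/imovie/id408981434")     # => "us" (default)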

.parse_fid_from_amazon_url(url) ⇒ Object



# File 'lib/url_common.rb', line 34

def self.parse_fid_from_amazon_url(url)
  tmp = /\/dp\/([A-Za-z0-9]+)/.match(url)
  if tmp && tmp[1]
    return tmp[1] 
  else
    return nil
  end
end
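
Illustrative call (hypothetical URL in /dp/ form):

UrlCommon.parse_fid_from_amazon_url("https://www.amazon.com/dp/B01DT4A2R4")  # => "B01DT4A2R4"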

.parse_fid_from_itunes_url(url) ⇒ Object

Example: UrlCommon.parse_fid_from_itunes_url("itunes.apple.com/us/app/imovie/id408981434?mt=12")



# File 'lib/url_common.rb', line 24

def self.parse_fid_from_itunes_url(url)
  tmp = /\/id([0-9]+)/.match(url)
  if tmp && tmp[1]
    return tmp[1] 
  else
    return nil
  end
end
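
Illustrative call (from the example in the description):

UrlCommon.parse_fid_from_itunes_url("https://itunes.apple.com/us/app/imovie/id408981434?mt=12")  # => "408981434"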

.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false) ⇒ Object



# File 'lib/url_common.rb', line 425

def self.parse_html_for_rssurl_from_head(site_url, page = nil, debug = false)
  if page
    status = :ok
  else
    status, page = UrlCommon.get_page(site_url)
  end
  puts "Into html parse for rssurl" if debug
  possibles = []
  if status == :ok && page
    results = page.css("link[rel='alternate'][type='application/rss+xml']")
    #
    # If only a single one then return it
    #
    return results.first['href'] if results.size == 1
    
    #
    # If an array then filter out the comments
    #
    results.each do |result|
      possibles << result unless result['title'] =~ /comments? feed/i
    end
    
    #
    # Loop over the possibles and just return the shortest url
    #
    # Todo -- can likely do a better job on this
    #
    urls = []
    possibles.each do |possible|
      urls << possible['href']
    end
    return UrlCommon.select_best_rssurl_from_rssurls(urls)
  end
end

.possible_rssurls(site_url, skip_slash_blog = false) ⇒ Object



# File 'lib/url_common.rb', line 401

def self.possible_rssurls(site_url, skip_slash_blog = false)
  # urls we will probe
  possible_rssurl_formats = []

  # normal baselines
  possible_rssurl_formats << "feed.xml"
  possible_rssurl_formats << "rss.xml"
  possible_rssurl_formats << "atom.xml"
  possible_rssurl_formats << "feed/"

  # optionally look at /blog/
  unless skip_slash_blog
    possible_rssurl_formats << "/blog/feed.xml"
    possible_rssurl_formats << "/blog/rss.xml"
    possible_rssurl_formats << "/blog/atom.xml"
    possible_rssurl_formats << "/blog/feed/"
  end

  possible_rssurls = []
  possible_rssurl_formats.each do |url_format|
    possible_rssurls << UrlCommon.join(site_url, url_format)
  end

  return possible_rssurls
end
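
Illustrative output (hypothetical site url):

UrlCommon.possible_rssurls("https://example.com/")
# => ["https://example.com/feed.xml", "https://example.com/rss.xml",
#     "https://example.com/atom.xml", "https://example.com/feed/",
#     "https://example.com/blog/feed.xml", "https://example.com/blog/rss.xml",
#     "https://example.com/blog/atom.xml", "https://example.com/blog/feed/"]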

.sanitize(long_url, url_base) ⇒ Object



# File 'lib/url_common.rb', line 576

def self.sanitize(long_url, url_base)
  long_url.strip!
  sanitized_url = long_url.downcase.gsub(/(https?:\/\/)|(www\.)/, "")
  url_base + sanitized_url
end
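
Illustrative call (hypothetical long url; the base matches the one used in create_permalink):

UrlCommon.sanitize("https://www.Example.com/Post", "https://pullquotes.io/pull_quotes/")
# => "https://pullquotes.io/pull_quotes/example.com/post"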

.select_best_rssurl_from_rssurls(urls) ⇒ Object



# File 'lib/url_common.rb', line 397

def self.select_best_rssurl_from_rssurls(urls)
  return urls.sort_by(&:length).first
end
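
Illustrative call (hypothetical feed urls):

UrlCommon.select_best_rssurl_from_rssurls(["https://a.com/blog/feed/", "https://a.com/feed/"])
# => "https://a.com/feed/" (shortest url wins)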

.strip_a_tag(a_tag) ⇒ Object



# File 'lib/url_common.rb', line 129

def self.strip_a_tag(a_tag)
  #<a href="https://www.keyingredient.com/recipes/12194051/egg-salad-best-ever-creamy/">
  return a_tag.sub(/<a href=[\"']/,'').sub(/[\"']>/,'')
end
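
Illustrative call (using the example from the comment in the code):

UrlCommon.strip_a_tag('<a href="https://www.keyingredient.com/recipes/12194051/egg-salad-best-ever-creamy/">')
# => "https://www.keyingredient.com/recipes/12194051/egg-salad-best-ever-creamy/"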

.summarize_url(url) ⇒ Object



# File 'lib/url_common.rb', line 369

def self.summarize_url(url)
  # incomplete: intended to GET /summarize?url=http://www.bbc.com/sport/0/football/25912393
  # against a summarization service; currently just fetches the page
  agent = Mechanize.new
  summarization_url = ""
  page = agent.get(url)
end

.test_random_url(url_or_host) ⇒ Object

A good test case for this is devslopes.com/ (a site that never returns a proper 404).



# File 'lib/url_common.rb', line 377

def self.test_random_url(url_or_host)
  random_filename = TextCommon.sha(Time.now.to_s) + ".xml"
  if url_or_host =~ /http/
    url = File.join(url_or_host, random_filename)
  else
    url = File.join("http://", url_or_host, random_filename)
  end
  status, url = UrlCommon.check_for_404(url, true)
  #
  # Key bit of logic -- if a randomized sha-based filename comes back 200, the
  # destination site never returns a proper 404 (it sends everything to the home
  # page), so flip the logic and treat a 200 as an error.
  #
  return :error, url if status == :ok
  return :ok, url
end
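
Illustrative reading of the result (hypothetical host):

status, url = UrlCommon.test_random_url("https://example.com")
# :ok    -- the random path 404'd, so the site's 404 handling can be trusted
# :error -- the random path returned 200, so 404 probes against this site are meaningless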

.url_base(url, base_domain = nil) ⇒ Object

Returns a url without the www. Example: UrlCommon.url_base("www.udemy.com/the-build-a-saas-app-with-flask-course/") => "udemy.com/the-build-a-saas-app-with-flask-course/"



# File 'lib/url_common.rb', line 140

def self.url_base(url, base_domain=nil)
  if base_domain.nil?
    base_domain = UrlCommon.get_base_domain(url)
  end
  begin
    url = url.gsub(/ /,'%20')      
    parts = URI.parse(url)
    extra = ""
    extra = "?#{parts.query}" if parts.query
    url_base = "#{base_domain}#{parts.path}#{extra}"
    return url_base[0..254]
  rescue StandardError => e
    fu = Fuzzyurl.from_string(url)
    base_domain = UrlCommon.get_base_domain(url)
    extra = ""
    extra = "?#{fu.query}" if fu.query
    url_base = "#{base_domain}#{fu.path}#{extra}"
    return url_base[0..254]
  end
end
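
Illustrative call (from the example in the description):

UrlCommon.url_base("https://www.udemy.com/the-build-a-saas-app-with-flask-course/")
# => "udemy.com/the-build-a-saas-app-with-flask-course/"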

.url_no_www(url) ⇒ Object

Examples: UrlCommon.url_no_www("www.youtube.com/watch?v=4Mj_zDw21xY"), UrlCommon.url_no_www("portal.azure.com/#home")



# File 'lib/url_common.rb', line 94

def self.url_no_www(url)
  parts = Fuzzyurl.new(url)
  if parts.query
    return parts.hostname.sub(/^www\./, '') + parts.path.to_s + '?' + parts.query
  elsif parts.fragment
    return parts.hostname.sub(/^www\./, '') + parts.path.to_s + "##{parts.fragment}"
  else
    return parts.hostname.sub(/^www\./, '') + parts.path.to_s
  end
end
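
Illustrative calls (from the examples in the description):

UrlCommon.url_no_www("https://www.youtube.com/watch?v=4Mj_zDw21xY")  # => "youtube.com/watch?v=4Mj_zDw21xY"
UrlCommon.url_no_www("https://portal.azure.com/#home")               # => "portal.azure.com/#home"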

.validate_with_merge_fragment(url, merge_fragment) ⇒ Object

Example: status, url = UrlCommon.validate_with_merge_fragment("nickjj/orats", "www.github.com/")



# File 'lib/url_common.rb', line 307

def self.validate_with_merge_fragment(url, merge_fragment)
  #
  # verify it is a valid url and it isn't a 404 or redirect
  # (check_for_404 returns a struct, so test its status rather than its truthiness)
  #
  if is_valid?(url) && check_for_404(url).status == 200
    return true, url
  end

  #
  # Try and make it valid
  #
  if url =~ /^http/
    # if it's invalid and has http then we don't know what to do, so return false
    return false, url
  end

  url = File.join(merge_fragment, url)
  if is_valid?(url) && check_for_404(url).status == 200
    return true, url
  end
  return false, url
end
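
Illustrative call (from the example in the description; hedged):

status, url = UrlCommon.validate_with_merge_fragment("nickjj/orats", "www.github.com/")
# status is true/false; url comes back merged as "www.github.com/nickjj/orats"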