Module: SL::URL

Included in:
SL
Defined in:
lib/searchlink/url.rb

Class Method Summary

Class Method Details

.amazon_affiliatize(url, amazon_partner) ⇒ Object



90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/searchlink/url.rb', line 90

# Rewrite an Amazon product URL as an affiliate link.
#
# @param url [String] the URL to convert
# @param amazon_partner [String, nil] affiliate tag appended as `tag=`
# @return [String] the unmodified +url+ when no partner tag is configured
# @return [Array(String, String)] +[url, title]+ otherwise; a URL that is not
#   an Amazon /dp/ or /gp/ product link is returned unchanged with an empty
#   title
def amazon_affiliatize(url, amazon_partner)
  return url if amazon_partner.nil? || amazon_partner.empty?

  pattern = %r{https?://(?<subdomain>.*?)amazon\.com/(?:(?<title>.*?)/)?(?<type>[dg])p/(?<id>[^?]+)}
  m = pattern.match(url)
  return [url, ''] unless m

  # The title slug is optional (https://www.amazon.com/dp/ID has none), so
  # the capture may be nil; fall back to an empty title instead of raising.
  title = m['title'].to_s.tr('-', ' ')

  [
    "https://#{m['subdomain']}amazon.com/#{m['type']}p/#{m['id']}/?ref=as_li_ss_tl&ie=UTF8&linkCode=sl1&tag=#{amazon_partner}",
    title
  ]
end

.only_url?(input) ⇒ Boolean

Returns:

  • (Boolean)


48
49
50
# File 'lib/searchlink/url.rb', line 48

# Test whether the input consists of nothing but a URL; the http(s) scheme
# is optional, but a dotted host name is required.
#
# @param input [String] text to test
# @return [Integer, nil] offset of the match (truthy) when the input is a
#   bare URL, nil otherwise
def only_url?(input)
  bare_url = %r{(?i)^((http|https)://)?([\w\-_]+(\.[\w\-_]+)+)([\w\-.,@?^=%&amp;:/~+#]*[\w\-@^=%&amp;/~+#])?$}
  input =~ bare_url
end

.ref_title_for_url(url) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/searchlink/url.rb', line 52

# Derive a short reference title from a URL without fetching the page.
#
# The candidate titles are the second-to-last host label (e.g. "example" for
# www.example.com) and the last path segment with hyphens turned into spaces
# and a 2-4 character file extension removed; the longer of the two wins.
#
# @param url [String, URI] the URL to derive a title from
# @return [String] the derived title
def ref_title_for_url(url)
  url = URI.parse(url) if url.is_a?(String)

  # hostname/path can be nil for host-less or opaque URIs
  parts = url.hostname.to_s.split('.')
  domain = parts.count > 1 ? parts[-2] : parts.join

  segment = url.path.to_s.split('/').last
  return domain unless segment

  # Non-bang calls: gsub! returns nil when nothing is replaced, so chaining
  # bang methods raised on any path without a hyphen.
  segment = segment.tr('-', ' ').sub(/\.\w{2,4}\z/, '')

  segment.length > domain.length ? segment : domain
end

.title(url) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/searchlink/url.rb', line 105

# Fetch a page and work out a display title for it.
#
# When the page has no usable title, an error is recorded and the title falls
# back to the host part of the URL with hyphens turned into spaces. Pipe
# characters are removed, SEO cleanup is applied when the 'remove_seo' config
# option is set, and the result is passed through String#remove_protocol
# (remove_seo! and remove_protocol are String extensions defined elsewhere).
#
# @param url [String] address of the page
# @return [Object] the cleaned title, or url.remove_protocol when anything
#   raises while retrieving it
def title(url)
  ## Gather proving too inexact
  # gather = false
  # ['/usr/local/bin', '/opt/homebrew/bin'].each do |root|
  #   if File.exist?(File.join(root, 'gather')) && File.executable?(File.join(root, 'gather'))
  #     gather = File.join(root, 'gather')
  #     break
  #   end
  # end

  # if gather
  #   cmd = %(#{gather} --title-only '#{url.strip}' --fallback-title 'Unknown')
  #   title = SL::Util.exec_with_timeout(cmd, 15)
  #   if title
  #     title = title.strip.gsub(/\n+/, ' ').gsub(/ +/, ' ')
  #     title.remove_seo!(url) if SL.config['remove_seo']
  #     return title.remove_protocol
  #   else
  #     SL.add_error('Error retrieving title', "Gather timed out on #{url}")
  #     SL.notify('Error retrieving title', 'Gather timed out')
  #   end
  # end

  page_title = Curl::Html.new(url).title || nil
  missing = page_title.nil? || page_title =~ /^\s*$/

  if missing
    SL.add_error('Title not found', "Warning: missing title for #{url.strip}")
    # Fall back to the host name: drop the scheme and everything from the first slash on
    host_only = url.gsub(%r{(^https?://|/.*$)}, '')
    page_title = host_only.gsub(/-/, ' ').strip
  else
    # Collapse line breaks and whitespace runs into single spaces
    single_line = page_title.gsub(/\n/, ' ')
    page_title = single_line.gsub(/\s+/, ' ').strip # .sub(/[^a-z]*$/i,'')
    page_title.remove_seo!(url) if SL.config['remove_seo']
  end

  page_title.gsub!(/\|/, '')
  page_title.remove_seo!(url.strip) if SL.config['remove_seo']
  page_title.remove_protocol
rescue StandardError
  SL.add_error('Error retrieving title', "Error determining title for #{url.strip}")
  warn "Error retrieving title for #{url.strip}"
  url.remove_protocol
end

.url?(input) ⇒ Boolean

Returns:

  • (Boolean)


44
45
46
# File 'lib/searchlink/url.rb', line 44

# Test whether the input looks like a link target, optionally followed by a
# double-quoted title. Accepted targets are a #fragment, an http(s) URL, a
# token with a leading or trailing slash, or a dotted token that does not
# start with "!".
#
# @param input [String] text to test
# @return [Integer, nil] offset of the match (truthy) when the input looks
#   like a URL, nil otherwise
def url?(input)
  url_like = %r{^(#.*|https?://\S+|/\S+|\S+/|[^!]\S+\.\S+)(\s+".*?")?$}
  input =~ url_like
end


72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/searchlink/url.rb', line 72

# Turn a bare URL into a [url, title] pair.
#
# The input is only converted when only_url? accepts it. A missing scheme is
# filled in with https:// and the URL is lowercased before parsing.
#
# @param url [String] candidate URL; the caller's string is not modified
# @param type [Symbol] :ref_title derives the title from the URL itself via
#   ref_title_for_url; any other value looks the title up with #title
# @return [Array(String, Object), false] [url, title], or false when the
#   input is not a bare URL, has no host, or cannot be parsed as a URI
def url_to_link(url, type)
  input = url.dup
  return false unless only_url?(input)

  input.sub!(%r{(?mi)^(?!https?://)(.*?)$}, 'https://\1')
  uri = URI.parse(input.downcase)
  # Check the host before resolving a title so that no lookup is made for a
  # URL that will be rejected anyway.
  return false unless uri.hostname

  link_title = if type == :ref_title
                 ref_title_for_url(uri)
               else
                 title(uri.to_s) || input.sub(%r{^https?://}, '')
               end

  [uri.to_s, link_title]
rescue URI::InvalidURIError
  # only_url? accepts characters (e.g. "^") that URI.parse rejects; report
  # such input as "not a link" rather than raising.
  false
end

.valid_link?(uri_str, limit = 5) ⇒ Boolean

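Validates that a link exists and returns a successful (2xx) response, following up to `limit` redirects. Links that cannot be checked are reported as errors but treated as valid.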

Returns:

  • (Boolean)


5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/searchlink/url.rb', line 5

# Check that a link resolves by sending a HEAD request and following
# redirects.
#
# Validation is best-effort: when the check itself cannot be completed (the
# server answers 405/503, or any error is raised) the problem is reported
# through SL and the link is treated as valid.
#
# @param uri_str [String, nil] the URL to validate
# @param limit [Integer] number of requests still allowed while following
#   redirects
# @return [Boolean] false for a nil URL, an exhausted redirect limit, a
#   redirect without a Location header, or a response that is not a
#   success/redirect/405/503; true otherwise
def valid_link?(uri_str, limit = 5)
  return false unless uri_str

  SL.notify('Validating', uri_str)
  return false if limit.zero?

  url = URI(uri_str)
  # Without a scheme there is nothing to request; accept the link as-is
  return true unless url.scheme

  # request_uri keeps the query string and yields "/" for an empty path
  response = Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == 'https') do |http|
    http.request_head(url.request_uri)
  end

  case response
  when Net::HTTPMethodNotAllowed, Net::HTTPServiceUnavailable
    # The server refused the HEAD request, so the link can't be verified.
    # amazon.com hosts are exempt from the error report.
    unless /amazon\.com/.match?(url.host)
      SL.add_error('link validation', "Validation blocked: #{uri_str} (#{response.code} #{response.message})")
    end
    SL.notify('Error validating', uri_str)
    true
  when Net::HTTPSuccess
    true
  when Net::HTTPRedirection
    location = response['location']
    return false unless location

    # Location may be relative; resolve it against the URL just requested
    valid_link?(URI.join(url.to_s, location).to_s, limit - 1)
  else
    SL.notify('Error validating', uri_str)
    false
  end
rescue StandardError => e
  SL.notify('Error validating', uri_str)
  SL.add_error('link validation', "Possibly invalid => #{uri_str} (#{e})")
  true
end