Class: String

Inherits:
Object
  • Object
show all
Defined in:
lib/searchlink/semver.rb,
lib/searchlink/string.rb,
lib/searchlink/curl/html.rb,
lib/searchlink/searches/hook.rb

Overview

Hookmark String helpers

Instance Method Summary collapse

Instance Method Details

#cleanString

Remove newlines, escape quotes, and remove Google Analytics strings

Returns:

  • (String)

    cleaned URL/String



116
117
118
119
120
121
122
# File 'lib/searchlink/string.rb', line 116

# Flatten a URL/string for use in link output.
#
# Runs of newlines become a single space, double quotes become the literal
# text `&quot` (no trailing semicolon), and pipes become hyphens. A run of
# `utm_s…`/`utm_c…`/`utm_m…` query parameters that is followed by another
# `&…` segment is replaced by that trailing segment, the first `?&` is
# dropped, and surrounding whitespace is stripped.
#
# @return [String] cleaned URL/String
def clean
  flattened = gsub(/\n+/, ' ')
  escaped = flattened.gsub(/"/, '&quot').gsub(/\|/, '-')
  untracked = escaped.gsub(/([&?]utm_[scm].+=[^&\s!,.)\]]++?)+(&.*)/, '\2')
  untracked.sub(/\?&/, '').strip
end

#close_punctuationString

Complete incomplete punctuation pairs

Returns:

  • (String)

    string with all punctuation properly paired



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/searchlink/string.rb', line 184

# Complete incomplete punctuation pairs.
#
# Walks the whitespace-separated words, tracking openers (“ ‘ [ ( <) that
# have not been closed yet, then appends the matching closers in reverse
# order. Before the closers are appended, any trailing run of characters
# other than letters, closers, periods or ellipses is replaced with "...".
#
# @return [String] string with all punctuation properly paired
def close_punctuation
  return self unless self =~ /[“‘\[(<]/

  punct_chars = {
    '“' => '”',
    '‘' => '’',
    '[' => ']',
    '(' => ')',
    '<' => '>'
  }

  left_punct = []

  split(/\s+/).each do |w|
    punct_chars.each do |opener, closer|
      left_punct.push(opener) if w.include?(opener)
      next unless w.include?(closer)

      # A closer with no pending opener (e.g. "foo) (bar") has nothing to
      # cancel; rindex returns nil then, and nil must not reach delete_at.
      pending = left_punct.rindex(opener)
      left_punct.delete_at(pending) if pending
    end
  end

  # Close the innermost opener first
  tail = left_punct.reverse.map { |c| punct_chars[c] }.join

  gsub(/[^a-z)\]’”.…]+$/i, '...').strip + tail
end

#close_punctuation!Object

Destructive punctuation close

See Also:



174
175
176
# File 'lib/searchlink/string.rb', line 174

# Destructive punctuation close: overwrites the receiver's contents with
# the result of #close_punctuation.
#
# @return [String] the receiver
# @see #close_punctuation
def close_punctuation!
  closed = close_punctuation
  replace(closed)
end

#code_indentString

Indent each line of string with 4 spaces

Returns:

  • (String)

    indented string



485
486
487
# File 'lib/searchlink/string.rb', line 485

# Indent each line of the string with 4 spaces.
#
# Lines are obtained by splitting on "\n", so trailing empty lines are
# dropped from the result.
#
# @return [String] indented string
def code_indent
  indent = ' ' * 4
  indented = split(/\n/).map { |line| indent + line }
  indented.join("\n")
end

#distance(t) ⇒ Object



398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
# File 'lib/searchlink/string.rb', line 398

# Compute the edit distance between self and another string, counting
# single-character insertions, deletions and substitutions.
#
# @param t [String] the string to compare against
# @return [Integer] number of edits needed to turn self into t
def distance(t)
  source = dup
  rows = source.length
  cols = t.length
  return rows if cols.zero?
  return cols if rows.zero?

  # grid[i][j] is the distance between the first i chars of source and
  # the first j chars of t; first column and first row are the base cases
  grid = Array.new(rows + 1) { |i| [i] + Array.new(cols) }
  grid[0] = (0..cols).to_a

  1.upto(cols) do |col|
    1.upto(rows) do |row|
      diagonal = grid[row - 1][col - 1]
      grid[row][col] =
        if source[row - 1] == t[col - 1]
          diagonal # characters agree, nothing to pay
        else
          # cheapest of deletion, insertion, substitution
          [grid[row - 1][col] + 1, grid[row][col - 1] + 1, diagonal + 1].min
        end
    end
  end

  grid[rows][cols]
end

#fix_gist_file ⇒ String

Convert file-myfile-rb to myfile.rb

Returns:

  • (String)


91
92
93
# File 'lib/searchlink/string.rb', line 91

# Convert a gist file slug such as "file-myfile-rb" to "myfile.rb".
#
# Strips a leading "file-" and turns the last hyphen-delimited segment
# into a dot extension.
#
# @return [String] the reconstructed filename
def fix_gist_file
  without_prefix = sub(/^file-/, '')
  without_prefix.sub(/-([^\-]+)$/, '.\1')
end

#matches_all(terms) ⇒ Object

Test that self matches every word in terms

Parameters:

  • terms (String)

    The terms to test



460
461
462
463
464
# File 'lib/searchlink/string.rb', line 460

# Test that self matches every word in terms.
#
# Matching is done against a copy of self with everything except letters,
# digits and spaces removed.
#
# @param terms [String, Array<Regexp>] terms to test; a String is
#   converted with #to_rx_array, anything else is used as given
# @return [Boolean] true when every pattern matches
def matches_all(terms)
  patterns = terms.is_a?(String) ? terms.to_rx_array : terms
  haystack = gsub(/[^a-z0-9 ]/i, '')
  patterns.all? { |rx| haystack =~ rx }
end

#matches_any(terms) ⇒ Object

Test if self contains any of terms

Parameters:

  • terms (String)

    The terms to test



449
450
451
452
453
# File 'lib/searchlink/string.rb', line 449

# Test if self contains any of terms.
#
# Matching is done against a copy of self with everything except letters,
# digits and spaces removed.
#
# @param terms [String, Array<Regexp>] terms to test; a String is
#   converted with #to_rx_array, anything else is used as given
# @return [Boolean] true when at least one pattern matches
def matches_any(terms)
  patterns = terms.is_a?(String) ? terms.to_rx_array : terms
  haystack = gsub(/[^a-z0-9 ]/i, '')
  patterns.any? { |rx| haystack =~ rx }
end

#matches_exact(string) ⇒ Object

Test if self contains an exact match for string (case insensitive)

Parameters:

  • string (String)

    The string to match



428
429
430
431
# File 'lib/searchlink/string.rb', line 428

# Test if self contains an exact (case-insensitive) match for string.
#
# Both sides are reduced to letters, digits and spaces first. The words of
# string must then appear in order, separated by one or more spaces, with
# the first word starting at a word boundary.
#
# @param string [String] The string to match
# @return [Integer, nil] index of the match in the reduced copy of self,
#   or nil when there is no match
def matches_exact(string)
  needle_words = string.gsub(/[^a-z0-9 ]/i, '').split(/ +/)
  needle = needle_words.map { |s| Regexp.escape(s) }.join(' +')
  haystack = gsub(/[^a-z0-9 ]/i, '')
  haystack =~ /\b#{needle}/i
end

#matches_fuzzy(terms, separator: ' ', start_word: true, threshhold: 5) ⇒ Object



384
385
386
387
388
389
390
391
392
393
394
395
396
# File 'lib/searchlink/string.rb', line 384

# Score self against terms using edit distance.
#
# Both strings are split into words on the separator. Every source/term
# pair whose #distance is within the threshold counts as one match. The
# score is matches divided by the number of term words, times 10, so it
# can exceed 10 when a term is close to several source words.
#
# @param terms [String] The terms to match
# @param separator [String] The word separator, interpolated into a
#   regular expression as-is
# @param start_word [Boolean] accepted for signature parity with
#   #matches_score; not used by this method
# @param threshhold [Integer] maximum edit distance that counts as a match
# @return [Numeric] score rounded to 3 decimals, 0 when terms has no words
def matches_fuzzy(terms, separator: ' ', start_word: true, threshhold: 5)
  # Non-capturing group: with a capturing group String#split also returns
  # the separators themselves, which would then be scored as words.
  splitter = /(?:#{separator})+/
  sources = split(splitter)
  words = terms.split(splitter)
  # Avoid 0 / 0.0 (NaN) when there is nothing to match against
  return 0 if words.empty?

  matches = sources.sum do |src|
    words.count { |term| src.distance(term) <= threshhold }
  end

  ((matches / words.count.to_f) * 10).round(3)
end

#matches_none(terms) ⇒ Object

Test that self does not contain any of terms

Parameters:

  • terms (String)

    The terms to test



438
439
440
441
442
# File 'lib/searchlink/string.rb', line 438

# Test that self does not contain any of terms.
#
# Matching is done against a copy of self with everything except letters,
# digits and spaces removed.
#
# @param terms [String, Array<Regexp>] terms to test; a String is
#   converted with #to_rx_array, anything else is used as given
# @return [Boolean] true when no pattern matches
def matches_none(terms)
  patterns = terms.is_a?(String) ? terms.to_rx_array : terms
  haystack = gsub(/[^a-z0-9 ]/i, '')
  patterns.none? { |rx| haystack =~ rx }
end

#matches_score(terms, separator: ' ', start_word: true) ⇒ Object

Score string based on number of matches, 0 - 10

Parameters:

  • terms (String)

    The terms to match

  • separator (String) (defaults to: ' ')

    The word separator

  • start_word (Boolean) (defaults to: true)

    Require match to be at beginning of word



371
372
373
374
375
376
377
378
379
380
381
382
# File 'lib/searchlink/string.rb', line 371

# Score string based on number of matches, 0 - 10.
#
# The score is the fraction of term patterns that match self, times 10.
#
# @param terms [String] The terms to match
# @param separator [String] The word separator
# @param start_word [Boolean] Require match to be at beginning of word
# @return [Numeric] 0 when nothing matches, otherwise the score rounded to
#   3 decimals
def matches_score(terms, separator: ' ', start_word: true)
  regexes = terms.to_rx_array(separator: separator, start_word: start_word)
  hits = regexes.count { |rx| self =~ rx }
  return 0 if hits.zero?

  ((hits / regexes.count.to_f) * 10).round(3)
end

#nil_if_missingNil, String

Test an AppleScript response, substituting nil for ‘Missing Value’

Returns:

  • (Nil, String)

    nil if string is “missing value”



356
357
358
359
360
# File 'lib/searchlink/string.rb', line 356

# Test an AppleScript response, substituting nil for "missing value".
#
# @return [nil, String] nil if the string contains "missing value",
#   otherwise the receiver itself
def nil_if_missing
  self =~ /missing value/ ? nil : self
end

#normalize_triggerString

Adds ?: to any parentheticals in a regular expression to avoid match groups

Returns:

  • (String)

    modified regular expression



32
33
34
# File 'lib/searchlink/string.rb', line 32

# Prepare a trigger pattern for embedding in a larger regular expression.
#
# Adds ?: to every opening parenthesis not already followed by ?: (to
# avoid match groups), removes a leading ^ or \A and a trailing $ or \Z,
# and downcases the result.
#
# @return [String] modified regular expression
def normalize_trigger
  non_capturing = gsub(/\((?!\?:)/, '(?:')
  unanchored = non_capturing.gsub(/(^(\^|\\A)|(\$|\\Z)$)/, '')
  unanchored.downcase
end

#parse_flagsObject

parse command line flags into long options



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/searchlink/string.rb', line 54

# Parse short command line flags into long options.
#
# A group such as "++dv" or "--t" (letters d, i, r, t, v, s) is expanded
# to one long option per letter. "++" yields "--name", "--" yields
# "--no-name". Runs of spaces in the final string are collapsed to one.
#
# @return [String] the string with short flag groups expanded
def parse_flags
  long_names = {
    'd' => 'debug',
    'i' => 'inline',
    'r' => 'prefix_random',
    't' => 'include_titles',
    'v' => 'validate_links',
    's' => 'remove_seo'
  }

  expanded = gsub(/(\+\+|--)([dirtvs]+)\b/) do
    sign = Regexp.last_match(1)
    letters = Regexp.last_match(2)
    negation = sign == '++' ? '' : 'no-'

    switches = letters.chars.map do |flag|
      long_names.key?(flag) ? "--#{negation}#{long_names[flag]} " : ''
    end

    " #{switches.join}"
  end

  expanded.gsub(/ +/, ' ')
end

#parse_flags!Object



82
83
84
# File 'lib/searchlink/string.rb', line 82

# Destructive version of #parse_flags: overwrites the receiver's contents
# with the expanded flags.
#
# @return [String] the receiver
# @see #parse_flags
def parse_flags!
  expanded = parse_flags
  replace(expanded)
end

#path_elementsArray

Extract the most relevant portions from a URL path

Returns:

  • (Array)

    array of relevant path elements



156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/searchlink/string.rb', line 156

# Extract the most relevant portions from a URL path.
#
# Drops a final path element that contains a dot or hyphen, then discards
# sections that are purely numeric or shorter than 5 characters.
#
# @return [Array<String>] array of relevant path elements
def path_elements
  # force trailing slash
  with_slash = url_path.sub(%r{/?$}, '/')
  # remove last path element
  without_leaf = with_slash.sub(%r{/[^/]+[.\-][^/]+/$}, '')
  # remove starting/ending slashes
  trimmed = without_leaf.gsub(%r{(^/|/$)}, '')
  # split at slashes, keep only sections that are at least
  # 5 characters long and not made up solely of digits
  trimmed.split(%r{/}).reject { |section| section =~ /^\d+$/ || section.length < 5 }
end

#remove_entitiesObject



6
7
8
# File 'lib/searchlink/curl/html.rb', line 6

# Replace every &nbsp; entity with a plain space.
#
# @return [String] copy of the string without &nbsp; entities
def remove_entities
  nbsp_entity = /&nbsp;/
  gsub(nbsp_entity, ' ')
end

#remove_protocolString

Remove the protocol from a URL

Returns:

  • (String)

    just hostname and path of URL



139
140
141
# File 'lib/searchlink/string.rb', line 139

# Remove the protocol from a URL.
#
# Recognizes http, https, ftp, sftp and file at the start of a line.
#
# @return [String] just hostname and path of URL
def remove_protocol
  protocol_prefix = %r{^(https?|s?ftp|file)://}
  sub(protocol_prefix, '')
end

#remove_seo(url) ⇒ String

Remove SEO elements from a title

Parameters:

  • url

    The url of the page from which the title came

Returns:



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'lib/searchlink/string.rb', line 231

# Remove SEO elements (site names and the separators around them) from a
# page title.
#
# The title is first normalized (dash and quote entities, HTML unescape,
# collapsed spaces). While the title still appears to contain the site's
# host name, it is split on common title separators and the segments that
# look like the site name are dropped (unless the URL is the site root).
# Finally, if separators remain, the longest segment is kept.
#
# @param url [String] The url of the page from which the title came
# @return [String] the cleaned title; the receiver itself when the URL has
#   no hostname or processing raises; or the HTML-unescaped receiver when
#   the cleaned title is 5 characters or shorter
def remove_seo(url)
  title = dup
  url = URI.parse(url)
  host = url.hostname
  unless host
    return self unless SL.config['debug']

    SL.add_error('Invalid URL', "Could not remove SEO for #{url}")
    return self

  end

  path = url.path
  root_page = path =~ %r{^/?$} ? true : false

  title.gsub!(/\s*(&ndash;|&mdash;)\s*/, ' - ')
  title.gsub!(/&[lr]dquo;/, '"')
  # Single curly quotes: CGI.unescapeHTML does not decode these named
  # entities, so they have to be converted here.
  title.gsub!(/&[lr]squo;/, "'")
  title.gsub!(/&#8211;/, ' — ')
  title = CGI.unescapeHTML(title)
  title.gsub!(/ +/, ' ')

  seo_title_separators = %w[| » « — – - · :]

  begin
    re_parts = []

    # Host name without "www." and the TLD, as a loose pattern that allows
    # one arbitrary character between each letter
    host_parts = host.sub(/(?:www\.)?(.*?)\.[^.]+$/, '\1').split(/\./).delete_if { |p| p.length < 3 }
    h_re = !host_parts.empty? ? host_parts.map { |seg| seg.downcase.split(//).join('.?') }.join('|') : ''
    re_parts.push(h_re) unless h_re.empty?

    # p_re = path.path_elements.map{|seg| seg.downcase.split(//).join('.?') }.join('|')
    # re_parts.push(p_re) if p_re.length > 0

    site_re = "(#{re_parts.join('|')})"

    # Upper bound on passes so the loop always terminates
    dead_switch = 0

    while title.downcase.gsub(/[^a-z]/i, '') =~ /#{site_re}/i

      break if dead_switch > 5

      seo_title_separators.each_with_index do |sep, i|
        parts = title.split(/ *#{Regexp.escape(sep)} +/)

        next if parts.length == 1

        remaining_separators = seo_title_separators[i..].map { |s| Regexp.escape(s) }.join('')
        seps = Regexp.new("^[^#{remaining_separators}]+$")

        longest = parts.longest_element.strip

        unless parts.empty?
          # Drop segments that look like the site name and contain none of
          # the remaining separators; root pages keep them
          parts.delete_if do |pt|
            compressed = pt.strip.downcase.gsub(/[^a-z]/i, '')
            compressed =~ /#{site_re}/ && pt =~ seps ? !root_page : false
          end
        end

        title = if parts.empty?
                  longest
                elsif parts.length < 2
                  parts.join(sep)
                elsif parts.length > 2
                  parts.longest_element.strip
                else
                  parts.join(sep)
                end
      end
      dead_switch += 1
    end
  rescue StandardError => e
    return self unless SL.config['debug']

    SL.add_error("Error SEO processing title for #{url}", e)
    return self
  end

  seps = Regexp.new(" *[#{seo_title_separators.map { |s| Regexp.escape(s) }.join('')}] +")
  if title =~ seps
    seo_parts = title.split(seps)
    title = seo_parts.longest_element.strip if seo_parts.length.positive?
  end

  title && title.length > 5 ? title.gsub(/\s+/, ' ') : CGI.unescapeHTML(self)
end

#remove_seo!(url) ⇒ Object

Destructively remove SEO elements from a title

Parameters:

  • url

    The url of the page from which the title came

See Also:



220
221
222
# File 'lib/searchlink/string.rb', line 220

# Destructively remove SEO elements from a title: overwrites the
# receiver's contents with the result of #remove_seo.
#
# @param url [String] The url of the page from which the title came
# @return [String] the receiver
# @see #remove_seo
def remove_seo!(url)
  cleaned = remove_seo(url)
  replace(cleaned)
end

#scrubupObject

Scrub invalid characters from string



5
6
7
# File 'lib/searchlink/string.rb', line 5

# Scrub invalid characters from string.
#
# Round-trips the string through UTF-16 (replacing invalid byte sequences
# on the way) back to UTF-8, then turns non-breaking spaces (U+00A0) into
# regular spaces.
#
# @return [String] scrubbed UTF-8 copy of the string
def scrubup
  as_utf16 = encode('utf-16', invalid: :replace)
  as_utf8 = as_utf16.encode('utf-8')
  as_utf8.gsub(/\u00A0/, ' ')
end

#scrubup!Object

See Also:

  • #scrub


10
11
12
# File 'lib/searchlink/string.rb', line 10

# Destructive version of #scrubup: overwrites the receiver's contents with
# the scrubbed string.
#
# Uses #scrubup rather than the built-in String#scrub, so the in-place
# variant performs the same cleanup as its non-destructive counterpart
# (including non-breaking space replacement).
#
# @return [String] the receiver
# @see #scrubup
def scrubup!
  replace scrubup
end

#slugifyString

Turn a string into a slug, removing spaces and non-alphanumeric characters

Returns:

  • (String)

    slugified string



100
101
102
# File 'lib/searchlink/string.rb', line 100

# Turn a string into a slug, removing spaces and non-alphanumeric
# characters.
#
# The string is downcased, every character other than a letter, digit or
# underscore becomes a hyphen, hyphen runs are collapsed, and one trailing
# hyphen is removed. A leading hyphen is left in place.
#
# @return [String] slugified string
def slugify
  hyphenated = downcase.gsub(/[^a-z0-9_]/i, '-')
  collapsed = hyphenated.gsub(/-+/, '-')
  collapsed.sub(/-?$/, '')
end

#slugify!Object

Destructive slugify

See Also:



106
107
108
# File 'lib/searchlink/string.rb', line 106

# Destructive slugify: overwrites the receiver's contents with the result
# of #slugify.
#
# @return [String] the receiver
# @see #slugify
def slugify!
  slug = slugify
  replace(slug)
end

#spacerString

Generate a spacer based on character widths for help dialog display

Returns:

  • (String)

    string containing tabs



41
42
43
44
45
46
47
48
49
50
51
# File 'lib/searchlink/string.rb', line 41

# Generate a spacer based on character widths for help dialog display.
#
# The width estimate is the string length, plus one for every m, w or v
# and minus one for every t.
#
# @return [String, nil] two tabs for an estimate of 0..3, a space and a
#   tab for 4..12, nil for anything else
def spacer
  width = length + scan(/[mwv]/).length - scan(/t/).length

  if (0..3).cover?(width)
    "\t\t"
  elsif (4..12).cover?(width)
    " \t"
  end
end

#split_hookObject



6
7
8
9
10
11
12
13
# File 'lib/searchlink/searches/hook.rb', line 6

# Split a single "name||url||path" record into a hash.
#
# Each field is passed through #nil_if_missing. A field that is absent
# (fewer than three "||"-separated parts, or empty trailing parts, which
# String#split drops) yields nil instead of raising NoMethodError.
#
# @return [Hash{Symbol => String, nil}] hash with :name, :url and :path
def split_hook
  elements = split(/\|\|/)
  {
    name: elements[0]&.nil_if_missing,
    url: elements[1]&.nil_if_missing,
    path: elements[2]&.nil_if_missing
  }
end

#split_hooksObject



15
16
17
# File 'lib/searchlink/searches/hook.rb', line 15

# Split a "^^"-separated list of records and convert each record with
# #split_hook.
#
# @return [Array] one #split_hook result per record
def split_hooks
  records = split(/\^\^/)
  records.map { |record| record.split_hook }
end

#to_amString

Convert an iTunes link to an Apple Music link

Returns:

  • (String)

    apple music link



127
128
129
130
131
132
# File 'lib/searchlink/string.rb', line 127

# Convert an iTunes link to an Apple Music link.
#
# Rewrites the itunes.apple.com host to geo.itunes.apple.com and appends
# an app=music query parameter, using & when the URL already has a query
# string and ? otherwise.
#
# @return [String] apple music link
def to_am
  input = dup
  # The pattern includes the slash before the host so an already-converted
  # geo.itunes.apple.com URL is left alone; the replacement must put that
  # slash back or "https://" would collapse to "https:/".
  input.sub!(%r{/itunes\.apple\.com}, '/geo.itunes.apple.com')
  append = input =~ %r{\?[^/]+=} ? '&app=music' : '?app=music'
  input + append
end

#to_rx_array(separator: ' ', start_word: true) ⇒ Array

Break a string into an array of Regexps

Parameters:

  • separator (String) (defaults to: ' ')

    The word separator

  • start_word (Boolean) (defaults to: true)

    Require matches at start of word

Returns:

  • (Array)

    array of regular expressions



475
476
477
478
479
# File 'lib/searchlink/string.rb', line 475

# Break a string into an array of Regexps, one per word.
#
# Runs of the separator are collapsed first. Within each word, every
# character other than a letter or digit becomes an optional wildcard
# (.?). All patterns are case-insensitive.
#
# @param separator [String] The word separator, interpolated into a
#   regular expression as-is
# @param start_word [Boolean] Require matches at start of word
# @return [Array<Regexp>] array of regular expressions
def to_rx_array(separator: ' ', start_word: true)
  prefix = start_word ? '\b' : ''
  collapsed = gsub(/(#{separator})+/, separator)

  collapsed.split(/#{separator}/).map do |word|
    loose = word.gsub(/[^a-z0-9]/i, '.?')
    /#{prefix}#{loose}/i
  end
end

#truncate(max) ⇒ Object

Truncate string to given length, preserving words

Parameters:

  • max (Number)

    The maximum length



334
335
336
337
338
339
340
341
342
343
344
345
346
347
# File 'lib/searchlink/string.rb', line 334

# Truncate string to given length, preserving words.
#
# Words are kept, in order, for as long as joining them with single
# spaces stays within max characters. If even the first word is longer
# than max, that word is returned whole.
#
# @param max [Integer] The maximum length
# @return [String] the receiver when it already fits, otherwise the
#   leading words that fit
def truncate(max)
  return self if length <= max

  words = split(/\s+/)
  kept = []
  used = 0

  words.each do |word|
    # Count the joining space once at least one word has been kept
    needed = kept.empty? ? word.length : used + 1 + word.length
    break if needed > max

    kept << word
    used = needed
  end

  # to_s covers a whitespace-only receiver, where there are no words at all
  kept.empty? ? words.first.to_s : kept.join(' ')
end

#truncate!(max) ⇒ Object

Truncate in place

Parameters:

  • max (Number)

    The maximum length

See Also:



325
326
327
# File 'lib/searchlink/string.rb', line 325

# Truncate in place: overwrites the receiver's contents with the result
# of #truncate.
#
# @param max [Integer] The maximum length
# @return [String] the receiver
# @see #truncate
def truncate!(max)
  shortened = truncate(max)
  replace(shortened)
end

#url_decodeObject



22
23
24
# File 'lib/searchlink/string.rb', line 22

# URL-decode the string using CGI.unescape.
#
# @return [String] decoded string
def url_decode
  encoded = self
  CGI.unescape(encoded)
end

#url_encodeString

URL Encode string

Returns:

  • (String)

    url encoded string



18
19
20
# File 'lib/searchlink/string.rb', line 18

# URL encode string.
#
# Any existing %22 is turned back into a double quote first, so quotes
# that were already encoded are not encoded twice.
#
# @return [String] url encoded string
def url_encode
  with_raw_quotes = gsub(/%22/, '"')
  ERB::Util.url_encode(with_raw_quotes)
end

#url_pathString

Return just the path of a URL

Returns:



148
149
150
# File 'lib/searchlink/string.rb', line 148

# Return just the path of a URL, as parsed by URI.parse.
#
# @return [String] the path component
def url_path
  parsed = URI.parse(self)
  parsed.path
end

#valid_version?Boolean

Test if given string is a valid semantic version number with major, minor and patch (and optionally pre)

Returns:

  • (Boolean)

    string is semantic version number



37
38
39
40
# File 'lib/searchlink/semver.rb', line 37

# Test if given string is a valid semantic version number with major,
# minor and patch (and optionally pre).
#
# The optional pre-release part is an optional hyphen followed by one or
# more non-digit characters and then optional digits.
#
# @return [Boolean] string is semantic version number
def valid_version?
  semver = /^\d+\.\d+\.\d+(-?([^0-9]+\d*))?$/
  semver.match?(self)
end