Module: MaRuKu::Strings

Included in:: In::Markdown::BlockLevelParser, In::Markdown::BlockLevelParser::LineSource, In::Markdown::SpanLevelParser::CharSourceManual, In::Markdown::SpanLevelParser::CharSourceStrscan, In::Markdown::SpanLevelParser::HTMLHelper, In::Markdown::SpanLevelParser::SpanContext, MDElement, String

Defined in:: lib/maruku/ext/yaml.rb,
lib/maruku.rb,
lib/maruku/string_utils.rb,
lib/maruku/input/type_detection.rb

Overview

Utility functions for dealing with strings.

Constant Summary collapse

TAB_SIZE =

AttributeDefinitionList = Captures: $1 = id, $2 = attribute list

/^\s{0,3}\{([\w\d\s]+)\}:\s*(.*?)\s*$/

InlineAttributeList =

/^\s{0,3}\{([:#\.].*?)\}\s*$/

Definition = Example: ^:blah blah ^: blah blah ^ : blah blah

%r{ 
  ^ # begin of line
  [ ]{0,3} # up to 3 spaces
  : # colon
  \s* # whitespace
  (\S.*) # the text    = $1
  $ # end of line
}x

Abbreviation = Example: *[HTML]: Hyper Text Markup Language

%r{
  ^  # begin of line
  [ ]{0,3} # up to 3 spaces
  \* # one asterisk
  \[ # opening bracket
  ([^\]]+) # any non-closing bracket:  id = $1
  \] # closing bracket
  :  # colon
  \s* # whitespace
  (\S.*\S)* #           definition=$2
  \s* # strip this whitespace
  $   # end of line
}x

FootnoteText =

%r{
  ^  # begin of line
  [ ]{0,3} # up to 3 spaces
  \[(\^.+)\]: # id = $1 (including '^')
  \s*(\S.*)?$    # text = $2 (optional)
}x

LinkRegex = This regex is taken from BlueCloth sources. Link definitions are in the form: ^[id]: \n? url “optional title”

%r{
    ^[ ]{0,3}\[([^\[\]]+)\]:    # id = $1
 [ ]*
    <?([^>\s]+)>?       # url = $2
 [ ]*
    (?:# Titles are delimited by "quotes" or (parens).
["(']
(.+?)     # title = $3
[")']     # Matching ) or "
\s*(.+)?   # stuff = $4
    )?  # title is optional
}x

IncompleteLink =

%r{^[ ]{0,3}\[([^\[\]]+?)\]:\s*$}

HeaderWithId =

/^(.*?)\{\#([\w_-]+)\}\s*$/

HeaderWithAttributes =

/^(.*?)\{(.*?)\}\s*$/

MightBeTableHeader = if contains a pipe, it could be a table header

%r{\|}

Sep = Example: -------------:

/\s*(\:)?\s*-+\s*(\:)?\s*/

TableSeparator = Example: | -------------:| ------------------------------ |

%r{^(\|?#{Sep}\|?)+?\s*$}

EMailAddress =

/<([^:]+?@[^:]+?)>/

Instance Method Summary collapse

#line_md_type(l) ⇒ Object
#number_of_leading_spaces(s) ⇒ Fixnum

Returns the number of leading spaces, considering that a tab counts as TAB_SIZE spaces.
#parse_email_headers(s) ⇒ Symbol => String

Parses email headers, returning a hash.
#parse_yaml_headers(s) ⇒ Symbol => String

Parses yaml headers, returning a hash.
#sanitize_ref_id(s) ⇒ String

Replace spaces with underscores and remove non-word characters.
#shellescape(str) ⇒ String

Escapes a string so that it can be safely used in a Bourne shell command line.
#spaces_before_first_char(s) ⇒ Fixnum

This returns the position of the first non-list character in a list item.
#split_lines(s) ⇒ String

Split a string into multiple lines, on line feeds and/or carriage returns.
#strip_indent(s, n) ⇒ String

Removes indentation from the beginning of `s`, up to at most `n` spaces.
#unquote(s) ⇒ String

Remove line-initial `>` characters for a quotation.

Instance Method Details

#line_md_type(l) ⇒ `Object`

# File 'lib/maruku/input/type_detection.rb', line 36

# Classifies a single line of Markdown source by the kind of block it
# could start or belong to.
#
# The checks run top to bottom and the first one that matches wins, so
# their order is significant; `:text` is the fallback when nothing matches.
#
# @param l [String] one line of input
# @return [Symbol] one of :text, :code, :empty, :footnote_text,
#   :ref_definition, :abbreviation, :definition, :xml_instr, :raw_html,
#   :ulist, :olist, :header1, :header2, :header3, :hrule, :quote,
#   :metadata, :ald or :ial
def line_md_type(l)
  # The order of evaluation is important (:text is a catch-all)
  return :text           if l =~ /^[a-zA-Z]/
  return :code           if number_of_leading_spaces(l) >= 4
  return :empty          if l =~ /^\s*$/
  return :footnote_text  if l =~ FootnoteText
  return :ref_definition if l =~ LinkRegex || l =~ IncompleteLink
  return :abbreviation   if l =~ Abbreviation
  return :definition     if l =~ Definition
  # Emails and URLs in angle brackets at the beginning of the line would
  # otherwise be mistaken for raw_html by the checks below. Both http and
  # https autolinks are covered.
  return :text if l =~ /^[ ]{0,3}#{EMailAddress}/
  return :text if l =~ /^[ ]{0,3}<https?:/
  # raw html is like PHP Markdown Extra: at most three spaces before
  return :xml_instr if l =~ %r{^\s*<\?}
  return :raw_html  if l =~ %r{^[ ]?[ ]?[ ]?</?\s*\w+}
  return :raw_html  if l =~ %r{^[ ]?[ ]?[ ]?<\!\-\-}
  # Something is wrong with how we parse lists! :-(
  #return :ulist    if l =~ /^[ ]{0,3}([\*\-\+])\s+.*\w+/
  #return :olist    if l =~ /^[ ]{0,3}\d+\..*\w+/
  return :ulist    if l =~ /^[ ]{0,1}([\*\-\+])\s+.*\w+/
  return :olist    if l =~ /^[ ]{0,1}\d+\..*\w+/
  return :header1  if l =~ /^(=)+/
  return :header2  if l =~ /^([-\s])+$/
  return :header3  if l =~ /^(#)+\s*\S+/
  # at least three asterisks on a line, and only whitespace
  return :hrule    if l =~ /^(\s*\*\s*){3,1000}$/
  return :hrule    if l =~ /^(\s*-\s*){3,1000}$/ # or hyphens
  return :hrule    if l =~ /^(\s*_\s*){3,1000}$/ # or underscores
  return :quote    if l =~ /^>/
  return :metadata if l =~ /^@/
#   if @@new_meta_data?
    return :ald   if l =~ AttributeDefinitionList
    return :ial   if l =~ InlineAttributeList
#   end
#   return :equation_end if l =~ EquationEnd
  return :text # else, it's just text
end

#number_of_leading_spaces(s) ⇒ `Fixnum`

Returns the number of leading spaces, considering that a tab counts as TAB_SIZE spaces.

Parameters:

s (String)

Returns:

(Fixnum)

# File 'lib/maruku/string_utils.rb', line 71

# Measures the width of the whitespace run at the very start of `s`.
# Each space counts as one column and each tab as TAB_SIZE columns.
#
# @param s [String]
# @return [Integer] the indentation width
def number_of_leading_spaces(s)
  indent = s[/^\s*/]
  tab_columns = indent.count("\t") * TAB_SIZE
  indent.count(" ") + tab_columns
end

#parse_email_headers(s) ⇒ `Symbol => String`

Parses email headers, returning a hash. The `:data` entry of the hash is the message; that is, anything past the headers.

Keys are downcased and converted to symbols; spaces become underscores. For example:

My key: true

becomes:

{:my_key => true}

Parameters:

s (String) —

The email

Returns:

(Symbol => String) —

The header values

# File 'lib/maruku/string_utils.rb', line 53

# Reads consecutive `Key: value` lines from the start of `s` and collects
# them into a hash. Each pair is passed through normalize_key_and_value
# and the resulting key is stored as a symbol. Whatever follows the last
# header line is stored under `:data`.
#
# @param s [String] the text, headers first
# @return [Hash{Symbol => Object}] the headers plus the `:data` entry
def parse_email_headers(s)
  scanner = StringScanner.new(s)
  headers = {}

  # Stop at the first position that does not start a header line.
  until scanner.scan(/(\w[\w\s\-]+): +(.*)\n/).nil?
    key, value = normalize_key_and_value(scanner[1], scanner[2])
    headers[key.to_sym] = value
  end

  headers[:data] = scanner.rest
  headers
end

#parse_yaml_headers(s) ⇒ `Symbol => String`

Parses yaml headers, returning a hash. The `:data` entry of the hash is the message; that is, anything past the headers.

Keys are downcased and converted to symbols; spaces become underscores. For example:

My key: true

becomes:

{:my_key => true}

Parameters:

s (String) —

the entire contents

Returns:

(Symbol => String) —

The header values

# File 'lib/maruku/ext/yaml.rb', line 23

# Parses a YAML header block delimited by `---` lines and returns its
# entries as a hash, with the text after the closing `---` under `:data`.
#
# Each key/value pair is passed through normalize_key_and_value and the
# resulting key is stored as a symbol. When `s` has no header block, the
# whole input is returned under `:data`. A header block that fails to
# parse, or that does not contain a mapping, contributes no headers.
#
# @param s [String] the entire contents
# @return [Hash{Symbol => Object}] the header values plus the `:data` entry
def parse_yaml_headers(s)
  headers = {}
  # NOTE(review): the pattern is not anchored to the start of the input, so
  # text before the first `---` line is dropped - confirm this is intended.
  match = /^(---\s*\n.*?\n?)^(---\s*$\n?)/m.match(s)

  # No header block: everything is body text.
  unless match
    headers[:data] = s
    return headers
  end

  begin
    # NOTE(review): YAML.load may build arbitrary objects on some Ruby
    # versions; consider YAML.safe_load if the input can be untrusted.
    hash = YAML.load(match[1])
  rescue => e
    # `name` is not defined on every class that includes this module;
    # fall back to a generic label rather than raising inside the handler.
    source = respond_to?(:name, true) ? name : 'document'
    puts "YAML Exception reading #{source}: #{e.message}"
    hash = {}
  end

  # An empty header block or a bare scalar does not load as a mapping.
  hash = {} unless hash.respond_to?(:each_pair)

  hash.each_pair do |yamlkey, yamlval|
    k, v = normalize_key_and_value(yamlkey, yamlval)
    headers[k.to_sym] = v
  end

  headers[:data] = match.post_match
  headers
end

#sanitize_ref_id(s) ⇒ `String`

Replace spaces with underscores and remove non-word characters.

Parameters:

s (String)

Returns:

(String)



105
106
107

# File 'lib/maruku/string_utils.rb', line 105

# Normalizes a reference id: surrounding whitespace is stripped, the text
# is lower-cased, spaces become underscores, and every remaining character
# that is not a word character is removed.
#
# @param s [String]
# @return [String]
def sanitize_ref_id(s)
  underscored = s.strip.downcase.gsub(' ', '_')
  underscored.gsub(/[^\w]/, '')
end

#shellescape(str) ⇒ `String`

Escapes a string so that it can be safely used in a Bourne shell command line.

Note that a resulted string should be used unquoted and is not intended for use in double quotes nor in single quotes.

This is a copy of the Shellwords.shellescape function in Ruby 1.8.7. It’s included for Ruby 1.8.6 compatibility.

Parameters:

str (String)

Returns:

(String)

# File 'lib/maruku/string_utils.rb', line 146

# Escapes a string for use as a single unquoted word on a Bourne shell
# command line. The argument itself is left untouched.
#
# @param str [String]
# @return [String] the escaped copy
def shellescape(str)
  # An empty argument would be dropped by the shell, so hand back
  # a pair of empty quotes instead.
  return "''" if str.empty?

  # Backslash-escape everything outside the safe set. The pattern works
  # byte by byte because not all shells are multibyte aware.
  escaped = str.gsub(/([^A-Za-z0-9_\-.,:\/@\n])/n, "\\\\\\1")

  # Backslash + LF is treated as a line continuation and discarded,
  # so a line feed is wrapped in single quotes instead.
  escaped.gsub(/\n/, "'\n'")
end

#spaces_before_first_char(s) ⇒ `Fixnum`

This returns the position of the first non-list character in a list item.

spaces_before_first_char(‘*Hello’) #=> 1 spaces_before_first_char(‘* Hello’) #=> 2 spaces_before_first_char(‘ * Hello’) #=> 3 spaces_before_first_char(‘ * Hello’) #=> 5 spaces_before_first_char(‘1.Hello’) #=> 2 spaces_before_first_char(‘ 1. Hello’) #=> 5

Parameters:

s (String)

Returns:

(Fixnum)

# File 'lib/maruku/string_utils.rb', line 89

def spaces_before_first_char(s)
  match = 
    case s.md_type
    when :ulist; s.match(/\s*.(\s*\{(.*?)\})?\s*/)
    when :olist; s.match(/s*\d+.(\s*\{(.*?)\})?\s*/)
    else
      tell_user "MARUKU BUG: '#{s.inspect}' is not a list"
      nil
    end
  match ? [match.end(0), match[0]] : [0, nil]
end

#split_lines(s) ⇒ `String`

Split a string into multiple lines, on line feeds and/or carriage returns.

Parameters:

s (String)

Returns:

(Array<String>) — the individual lines, without their line terminators



33
34
35

# File 'lib/maruku/string_utils.rb', line 33

# Breaks `s` into lines, treating CRLF, a lone CR and a lone LF each as
# one line terminator.
#
# @param s [String]
# @return [Array<String>] the lines without their terminators
def split_lines(s)
  line_break = /\r\n|\r|\n/
  s.split(line_break)
end

#strip_indent(s, n) ⇒ `String`

Removes indentation from the beginning of `s`, up to at most `n` spaces. Tabs are counted as TAB_SIZE spaces.

Parameters:

s (String)
n (Fixnum)

Returns:

(String)

# File 'lib/maruku/string_utils.rb', line 124

# Drops leading indentation from `s` one character at a time until `n`
# columns have been consumed or a non-indent character is reached.
# A space uses one column; a tab uses TAB_SIZE columns.
#
# @param s [String]
# @param n [Integer] the number of columns to remove
# @return [String] the remainder of `s`
def strip_indent(s, n)
  budget = n
  text = s
  while budget > 0
    cost =
      case text[0]
      when ?\s then 1
      when ?\t then TAB_SIZE
      end
    # Anything other than a space or tab (including end of string) stops.
    break if cost.nil?
    budget -= cost
    text = text[1..-1]
  end
  text
end

#unquote(s) ⇒ `String`

Remove line-initial `>` characters for a quotation.

Parameters:

s (String)

Returns:

(String)



113
114
115

# File 'lib/maruku/string_utils.rb', line 113

# Strips the quotation marker from every line of `s`: a `>` at the start
# of a line together with one following whitespace character, if present.
#
# @param s [String]
# @return [String]
def unquote(s)
  quote_marker = /^>\s?/
  s.gsub(quote_marker, '')
end

Module: MaRuKu::Strings

Overview

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#line_md_type(l) ⇒ Object

#number_of_leading_spaces(s) ⇒ Fixnum

#parse_email_headers(s) ⇒ Symbol => String

#parse_yaml_headers(s) ⇒ Symbol => String

#sanitize_ref_id(s) ⇒ String

#shellescape(str) ⇒ String

#spaces_before_first_char(s) ⇒ Fixnum

#split_lines(s) ⇒ String

#strip_indent(s, n) ⇒ String

#unquote(s) ⇒ String

#line_md_type(l) ⇒ `Object`

#number_of_leading_spaces(s) ⇒ `Fixnum`

#parse_email_headers(s) ⇒ `Symbol => String`

#parse_yaml_headers(s) ⇒ `Symbol => String`

#sanitize_ref_id(s) ⇒ `String`

#shellescape(str) ⇒ `String`

#spaces_before_first_char(s) ⇒ `Fixnum`

#split_lines(s) ⇒ `String`

#strip_indent(s, n) ⇒ `String`

#unquote(s) ⇒ `String`