Class: Metrocot::TextPattern

Inherits:
BasePattern show all
Defined in:
lib/metrocot.rb

Overview

Matches a literal text string or a regular expression inside text nodes.

Instance Attribute Summary

Attributes inherited from BasePattern

#matched, #metrocot, #name, #node_scraper, #pattern_no, #pred, #source, #succ

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from BasePattern

#default_scanner, #dump, #dump_match_map, #log, #log_match_data, #optional, #with_scanned_match_data

Constructor Details

#initialize(source, text) ⇒ TextPattern

Returns a new instance of TextPattern.



565
566
567
568
# File 'lib/metrocot.rb', line 565

# Builds a pattern that matches +text+.
#
# source - forwarded unchanged to the superclass constructor.
# text   - what to look for; parse passes either a String or a Regexp.
def initialize(source, text)
  super(source)
  @text = text
end

Class Method Details

.parse(s) ⇒ Object



574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
# File 'lib/metrocot.rb', line 574

# Parses a text pattern literal found at the very start of +s+.
#
# The first character of +s+ selects the form:
#
#   $      - built with the regexp /[\r\n]/ (a line-break character)
#   /.../  - a regexp; \/ stands for a slash inside the expression
#   "..."  - a plain string; \" stands for a double quote inside it
#
# Inside a regexp a backslash always protects the character after it, so
# an escaped backslash right before the closing slash no longer hides
# that slash. Inside a string only \" is special; any other backslash is
# kept as an ordinary character.
#
# A missing closing delimiter is tolerated: the literal then runs to the
# end of +s+.
#
# Returns new(consumed, text), where consumed is the exact prefix of +s+
# that made up the literal and text is the Regexp or String to match, or
# nil when +s+ starts with none of the three forms. Regexp.compile raises
# RegexpError when the regexp body is invalid.
def self.parse( s )
  case s[0, 1]
  when "$"
    new( "$", /[\r\n]/ )
  when "/"
    consumed, body = scan_delimited( s, "/", true )
    new( consumed, Regexp.compile( body ) )
  when "\""
    consumed, body = scan_delimited( s, "\"", false )
    new( consumed, body )
  end
end

# Internal helper for parse: reads the delimited literal that opens +s+.
#
# s              - text whose first character is +delim+.
# delim          - the one-character delimiter ("/" or "\"").
# regexp_escapes - when true, a backslash followed by any character other
#                  than +delim+ is copied as a pair so that the second
#                  character is never taken for part of an escaped
#                  delimiter.
#
# Scanning stops right after the first unescaped +delim+, or at the end
# of +s+ when there is none.
#
# Returns [consumed, body]: the prefix of +s+ that was read (delimiters
# and escapes included) and the text between the delimiters with every
# escaped delimiter reduced to the bare delimiter.
def self.scan_delimited( s, delim, regexp_escapes )
  escaped_delim = "\\" + delim
  body = "".dup
  pos = 1 # skip the opening delimiter

  while pos < s.size
    if s[pos, 1] == delim
      pos += 1
      break
    elsif s[pos, 2] == escaped_delim
      body << delim
      pos += 2
    elsif regexp_escapes && s[pos, 1] == "\\"
      # keep the escape pair intact for the regexp compiler
      body << s[pos, 2]
      pos += 2
    else
      body << s[pos, 1]
      pos += 1
    end
  end

  [s[0, pos], body]
end

Instance Method Details

#description ⇒ Object



570
571
572
# File 'lib/metrocot.rb', line 570

# Short human-readable label for this pattern: the word "text" followed
# by the pattern's text in double quotes. A Regexp is rendered through
# its to_s form by the interpolation.
def description
  %(text "#{@text}")
end

#each_match(match_range, match_map) ⇒ Object



652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
# File 'lib/metrocot.rb', line 652

# Searches the text nodes of +match_range+, left to right, for
# occurrences of this pattern's text (a String or a Regexp).
#
# match_range - supplies hnodes, the start/end node index and offset, and
#               crop for building the range of a single occurrence.
# match_map   - handed to with_scanned_match_data together with the
#               matched text.
#
# For each occurrence the caller's block is yielded, from inside the
# with_scanned_match_data block, the cropped range and the map that
# with_scanned_match_data passes on. A truthy result of
# with_scanned_match_data ends the search and is returned; otherwise the
# search resumes behind the occurrence. Nodes that are not text are
# skipped, and every text node is searched on its own, so an occurrence
# never spans two nodes.
#
# Returns the first truthy result, or nil when the range is exhausted.
def each_match( match_range, match_map )

  super(match_range, match_map)

  hnodes = match_range.hnodes
  match_index = match_range.start_index
  match_offset = match_range.start_offset

  # true while the cursor (match_index, match_offset) lies before the end of the range
  in_range = lambda {
    match_index < match_range.end_index ||
      (match_index == match_range.end_index && match_offset < match_range.end_offset)
  }

  while in_range.call

    # move the cursor to the next text node
    while in_range.call && ! hnodes[match_index].text?
      log( "not text: ##{match_index} #{hnodes[match_index].class}" )
      match_index += 1
      match_offset = 0
    end

    unless in_range.call
      log( "no match found" )
      return nil
    end

    # the last node of the range only counts up to the end offset
    hnode_text = if match_index == match_range.end_index
      hnodes[match_index].inner_text[0...match_range.end_offset]
    else
      hnodes[match_index].inner_text
    end

    log( "trying text match on: #{hnode_text[match_offset .. -1]}" )

    match_start_offset = hnode_text.index( @text, match_offset )

    if match_start_offset.nil?
      log( "no match found for #{@text}" )
      match_index += 1
      match_offset = 0
      next
    end

    # Take the matched text from the search that just succeeded. Matching
    # again on the tail of the string would hide the preceding characters
    # from look-behinds and anchors and could yield nil or another match.
    actual_match = @text.is_a?( Regexp ) ? Regexp.last_match[0] : @text

    log( "next text match at #{match_index}.#{match_start_offset}: #{actual_match}" )

    match_end_offset = match_start_offset + actual_match.size

    if match_end_offset >= hnode_text.size
      log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
    else
      log( "matched first #{match_end_offset - match_start_offset} chars" )
    end

    result = with_scanned_match_data( match_map, actual_match ) { |scanned_map|
      yield( match_range.crop( match_index, match_start_offset, match_index, match_end_offset ), scanned_map )
    }

    return result if result

    # Always step forward: an empty match would otherwise be found again
    # at the same offset forever.
    match_offset = match_end_offset > match_start_offset ? match_end_offset : match_start_offset + 1

  end

  nil

end

#priority ⇒ Object



636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
# File 'lib/metrocot.rb', line 636

# Numeric priority of this pattern.
#
# A String pattern rates -2 and any other kind of text (a Regexp when
# built by parse) rates -3; a pattern that has a name rates two lower
# still (-4 and -5).
# NOTE(review): how callers order patterns by this value is not visible
# here - confirm against the code that reads it.
def priority
  rank = @text.is_a?( String ) ? -2 : -3
  name ? rank - 2 : rank
end