ParseMaster.pm 7.16 KB
Newer Older
Motohisa Nakano committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
#ParseMaster (July 25 2005)
#  Based on "ParseMaster.js" by Dean Edwards <http://dean.edwards.name/>
#  Ported to Perl by Rob Seiler, ELR Software Pty Ltd <http://www.elr.com.au>
#  Copyright 2005. License <http://creativecommons.org/licenses/LGPL/2.1/>

package ParseMaster;
use strict;
use Data::Dumper;

# Package wide variable declarations
use vars qw/$VERSION
            @_X_escaped @_X_patterns
           /;

$VERSION    = '017';

# constants
my $X_EXPRESSION  = 0;
my $X_REPLACEMENT = 1;
my $X_LENGTH      = 2;

# re's used to determine nesting levels
my $X_GROUPS      = qr/\(/o;                # NB: Requires g modifier!
my $X_SUB_REPLACE = qr/\$\d/o;
my $X_INDEXED     = qr/^\$\d+$/o;
my $XX_ESCAPE     = qr/\\./o;               # NB: Requires g modifier!
my $XX_DELETED    = qr/\001[^\001]*\001/o;  # NB: Requires g modifier!
my $DIGIT         = qr/[^\D]/o;             # Yep - this is a digit - contains no non-digits

# Constructor
sub new {
  my $class = shift;
  my $self  = {};
  @_X_escaped  = ();  # Re-initialize global for each instance
  @_X_patterns = ();  # Re-initialize global for each instance
  # Instance variables - access by similarly named set/get functions
  $self->{_ignoreCase_} = 0;
  $self->{_escapeChar_} = '';
  bless ($self, $class);
  return $self;
}

sub ignoreCase {
  my ($self, $value) = @_;
  if (defined($value)) {
    $self->{_ignoreCase_} = $value;
  }
  return $self->{_ignoreCase_};
}

sub escapeChar{
  my ($self, $value) = @_;
  if (defined($value)) {
    $self->{_escapeChar_} = $value;
  }
  return $self->{_escapeChar_};
}

#######################
# Public Parsemaster functions

my $X_DELETE = sub(@$) {
  my $X_offset = pop;
  my @X_match = @_;
  return (chr(001) . $X_match[$X_offset] . chr(001));
}; # NB semicolon required for closure!

# create and add a new pattern to the patterns collection
sub add {
  my ($self, $expression, $X_replacement) = @_;
  if (!$X_replacement) {$X_replacement = $X_DELETE};

  # count the number of sub-expressions
  my $temp = &_X_internalEscape($expression);
  my $length  = 1; # Always at least one because each pattern is itself a sub-expression
     $length += $temp =~ s/$X_GROUPS//g; # One way to count the left capturing parentheses in the regexp string

  # does the pattern deal with sub-expressions?
  if ((ref($X_replacement) ne "CODE") && ($X_replacement =~ m/$X_SUB_REPLACE/)) {
    if ($X_replacement =~ m/$X_INDEXED/) { # a simple lookup? (eg "$2")
      # store the index (used for fast retrieval of matched strings)
      $X_replacement = substr($X_replacement,1) - 1;
    }
    else { # a complicated lookup (eg "Hello $2 $1")
      my $i = $length;
      while ($i) { # Had difficulty getting Perl to do Dean's splitting and joining of strings containing $'s
        my $str = '$a[$o+' . ($i-1) . ']'; # eg $a[$o+1]
        $X_replacement =~ s/\$$i/$str/;      # eg $2 $3 -> $a[$o+1] $a[$o+2]
        $i--;
      }
      # build a function to do the lookup - returns interpolated string of array lookups
      $X_replacement = eval('sub {my $o=pop; my @a=@_; return "' . $X_replacement . '"};');
    }
  }
  else {}
  # pass the modified arguments
  &_X_add($expression || q/^$/, $X_replacement, $length);
}

# execute the global replacement
sub exec {
#print Dumper(@_X_patterns);
  my ($self, $X_string) = @_;
  my $escChar    = $self->escapeChar();
  my $ignoreCase = $self->ignoreCase();
  my ($regexp,$captures) = &_getPatterns();  # Concatenated and parenthesized regexp eg '(regex1)|(regex2)|(regex3)' etc
  $X_string = &_X_escape($X_string, $escChar);
  if ($ignoreCase) {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/gie} # Pass $X_String as a
    else           {$X_string =~ s/$regexp/{&_X_replacement(&_matchVars($captures,\$X_string))}/ge}  # reference for speed

  $X_string = &_X_unescape($X_string, $escChar);
  $X_string =~ s/$XX_DELETED//g;
  return $X_string;
}

sub _X_add {
  push (@_X_patterns, [@_]); # Save each argument set as is into an array of arrays
}

# this is the global replace function (it's quite complicated)
sub _X_replacement {
  my (@arguments) = @_;
#print Dumper (@arguments);
  if ($arguments[0] le '') {return ''}
  # Dereference last index (source String) here - faster than in _matchVars (maybe not needed at all?)
  $arguments[$#arguments] = ${$arguments[$#arguments]};
  my $i = 1;
  # loop through the patterns
  for (my $j=0; $j<scalar(@_X_patterns); $j++) { # Loop through global all @_X_patterns
    my @X_pattern = @{$_X_patterns[$j]};
    # do we have a result? NB: "if ($arguments[$i])" as in Dean's Javascript is false for the value 0!!!
    if ((defined $arguments[$i]) && ($arguments[$i] gt '')) {
      my $X_replacement = $X_pattern[$X_REPLACEMENT];
      # switch on type of $replacement
      if (ref($X_replacement) eq "CODE") {     # function
        return &$X_replacement(@arguments,$i);
      }
      elsif ($X_replacement =~ m/$DIGIT/) {    # number (contains no non-digits)
        return $arguments[$X_replacement + $i];
      }
      else { # default
        return $X_replacement;                 # default
      }
    } # skip over references to sub-expressions
    else {$i += $X_pattern[$X_LENGTH]}
  }
}

#######################
# Private functions
#######################

# encode escaped characters
sub _X_escape {
  my ($X_string, $X_escapeChar) = @_;
  if ($X_escapeChar) {
    my $re = '\\'.$X_escapeChar.'(.)';
    $X_string =~ s/$re/{push(@_X_escaped,$1); $X_escapeChar}/ge;
  }
  return $X_string;
}

# decode escaped characters
sub _X_unescape {
  my ($X_string, $X_escapeChar) = @_;
  if ($X_escapeChar) { # We'll only do this if there is an $X_escapeChar!
    my $re = '\\'.$X_escapeChar;
    $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped))}/ge; # Don't use Dean Edwards as below 'or' here - because zero will return ''!
  # $X_string =~ s/$re/{$X_escapeChar . (shift(@_X_escaped) || '')}/ge;
  }
  return $X_string;
}

sub _X_internalEscape {
  my ($string) = shift;
  $string =~ s/$XX_ESCAPE//g;
  return $string;
}

# Builds an array of match variables to (approximately) emulate that available in Javascript String.replace()
sub _matchVars {
  my ($m,$sref) = @_;
  my @args = (1..$m);                # establish the number potential memory variables
  my @mv = map {eval("\$$_")} @args; # matchvarv[1..m] = the memory variables $1 .. $m
  unshift (@mv, $&);                 # matchvar[0]     = the substring that matched
  push    (@mv, length($`));         # matchvar[m+1]   =  offset within the source string where the match occurred (= length of prematch string)
  push    (@mv, $sref);              # matchvar[m+2]   = reference to full source string (dereference in caller if/when needed)
#print Dumper (@mv);
  return @mv;
}

sub _getPatterns {
  my @Patterns = ();
  my $lcp = 0;
  for (my $i=0; $i<scalar(@_X_patterns); $i++) {       # Loop through global all @_patterns
    push (@Patterns, $_X_patterns[$i][$X_EXPRESSION]); # accumulate the expressions
    $lcp += $_X_patterns[$i][$X_LENGTH];               # sum the left capturing parenthesis counts
  }
  my $str = "(" . join(')|(',@Patterns). ")";          # enclose each pattern in () separated by "|"
  return ($str, $lcp);
}

##################
# END            #
##################
1; # ParseMaster #
##################