perltest.pl 6.13 KB
Newer Older
1
#! /usr/bin/env perl
eg's avatar
eg committed
2 3

# Program for testing regular expressions with perl to check that PCRE handles
4 5 6 7 8
# them the same. This version needs to have "use utf8" at the start for running
# the UTF-8 tests, but *not* for the other tests. The only way I've found for
# doing this is to cat this line in explicitly in the RunPerlTest script. I've
# also used this method to supply "require Encode" for the UTF-8 tests, so that
# the main test will still run where Encode is not installed.
eg's avatar
eg committed
9

10 11
#use utf8;
#require Encode;
eg's avatar
eg committed
12

13
# Function for turning a string into a string of printing chars.
eg's avatar
eg committed
14 15 16 17 18 19 20 21 22 23

sub pchars {
my($t) = "";

if ($utf8)
  {
  @p = unpack('U*', $_[0]);
  foreach $c (@p)
    {
    if ($c >= 32 && $c < 127) { $t .= chr $c; }
24 25
      else { $t .= sprintf("\\x{%02x}", $c);
      }
eg's avatar
eg committed
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
    }
  }
else
  {
  foreach $c (split(//, $_[0]))
    {
    if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
      else { $t .= sprintf("\\x%02x", ord $c); }
    }
  }

$t;
}


# Read lines from named file or stdin and write to named file or stdout; lines
# consist of a regular expression, in delimiters and optionally followed by
# options, followed by a set of test data, terminated by an empty line.

# Sort out the input and output files

if (@ARGV > 0)
  {
  open(INFILE, "<$ARGV[0]") || die "Failed to open $ARGV[0]\n";
  $infile = "INFILE";
  }
else { $infile = "STDIN"; }

if (@ARGV > 1)
  {
  open(OUTFILE, ">$ARGV[1]") || die "Failed to open $ARGV[1]\n";
  $outfile = "OUTFILE";
  }
else { $outfile = "STDOUT"; }

printf($outfile "Perl $] Regular Expressions\n\n");

# Main loop

NEXT_RE:
for (;;)
  {
  printf "  re> " if $infile eq "STDIN";
  last if ! ($_ = <$infile>);
  printf $outfile "$_" if $infile ne "STDIN";
71
  next if ($_ =~ /^\s*$/ || $_ =~ /^< forbid/);
eg's avatar
eg committed
72 73 74 75 76 77 78 79 80 81 82

  $pattern = $_;

  while ($pattern !~ /^\s*(.).*\1/s)
    {
    printf "    > " if $infile eq "STDIN";
    last if ! ($_ = <$infile>);
    printf $outfile "$_" if $infile ne "STDIN";
    $pattern .= $_;
    }

83 84
  chomp($pattern);
  $pattern =~ s/\s+$//;
eg's avatar
eg committed
85 86 87

  # The private /+ modifier means "print $' afterwards".

88 89 90 91 92
  $showrest = ($pattern =~ s/\+(?=[a-zA-Z]*$)//);

  # A doubled version is used by pcretest to print remainders after captures

  $pattern =~ s/\+(?=[a-zA-Z]*$)//;
eg's avatar
eg committed
93

94
  # Remove /8 from a UTF-8 pattern.
eg's avatar
eg committed
95

96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
  $utf8 = $pattern =~ s/8(?=[a-zA-Z]*$)//;

  # Remove /J from a pattern with duplicate names.

  $pattern =~ s/J(?=[a-zA-Z]*$)//;

  # Remove /K from a pattern (asks pcretest to check MARK data) */

  $pattern =~ s/K(?=[a-zA-Z]*$)//;

  # /W asks pcretest to set PCRE_UCP; change this to /u for Perl

  $pattern =~ s/W(?=[a-zA-Z]*$)/u/;

  # Remove /S or /SS from a pattern (asks pcretest to study or not to study)

  $pattern =~ s/S(?=[a-zA-Z]*$)//g;

  # Remove /Y and /O from a pattern (disable PCRE optimizations)

  $pattern =~ s/[YO](?=[a-zA-Z]*$)//;
eg's avatar
eg committed
117 118 119

  # Check that the pattern is valid

120
  eval "\$_ =~ ${pattern}";
eg's avatar
eg committed
121 122 123
  if ($@)
    {
    printf $outfile "Error: $@";
124 125 126 127 128 129 130 131
    if ($infile != "STDIN")
      {
      for (;;)
        {
        last if ! ($_ = <$infile>);
        last if $_ =~ /^\s*$/;
        }
      }
eg's avatar
eg committed
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
    next NEXT_RE;
    }

  # If the /g modifier is present, we want to put a loop round the matching;
  # otherwise just a single "if".

  $cmd = ($pattern =~ /g[a-z]*$/)? "while" : "if";

  # If the pattern is actually the null string, Perl uses the most recently
  # executed (and successfully compiled) regex is used instead. This is a
  # nasty trap for the unwary! The PCRE test suite does contain null strings
  # in places - if they are allowed through here all sorts of weird and
  # unexpected effects happen. To avoid this, we replace such patterns with
  # a non-null pattern that has the same effect.

  $pattern = "/(?#)/$2" if ($pattern =~ /^(.)\1(.*)$/);

  # Read data lines and test them

  for (;;)
    {
    printf "data> " if $infile eq "STDIN";
    last NEXT_RE if ! ($_ = <$infile>);
    chomp;
    printf $outfile "$_\n" if $infile ne "STDIN";

158 159 160
    s/\s+$//;  # Remove trailing space
    s/^\s+//;  # Remove leading space
    s/\\Y//g;  # Remove \Y (pcretest flag to set PCRE_NO_START_OPTIMIZE)
eg's avatar
eg committed
161 162 163 164

    last if ($_ eq "");
    $x = eval "\"$_\"";   # To get escapes processed

165 166
    # Empty array for holding results, ensure $REGERROR and $REGMARK are
    # unset, then do the matching.
eg's avatar
eg committed
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188

    @subs = ();

    $pushes = "push \@subs,\$&;" .
         "push \@subs,\$1;" .
         "push \@subs,\$2;" .
         "push \@subs,\$3;" .
         "push \@subs,\$4;" .
         "push \@subs,\$5;" .
         "push \@subs,\$6;" .
         "push \@subs,\$7;" .
         "push \@subs,\$8;" .
         "push \@subs,\$9;" .
         "push \@subs,\$10;" .
         "push \@subs,\$11;" .
         "push \@subs,\$12;" .
         "push \@subs,\$13;" .
         "push \@subs,\$14;" .
         "push \@subs,\$15;" .
         "push \@subs,\$16;" .
         "push \@subs,\$'; }";

189 190 191
    undef $REGERROR;
    undef $REGMARK;

192
    eval "${cmd} (\$x =~ ${pattern}) {" . $pushes;
eg's avatar
eg committed
193 194 195 196 197 198 199 200

    if ($@)
      {
      printf $outfile "Error: $@\n";
      next NEXT_RE;
      }
    elsif (scalar(@subs) == 0)
      {
201 202 203 204
      printf $outfile "No match";
      if (defined $REGERROR && $REGERROR != 1)
        { printf $outfile (", mark = %s", &pchars($REGERROR)); }
      printf $outfile "\n";
eg's avatar
eg committed
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
      }
    else
      {
      while (scalar(@subs) != 0)
        {
        printf $outfile (" 0: %s\n", &pchars($subs[0]));
        printf $outfile (" 0+ %s\n", &pchars($subs[17])) if $showrest;
        $last_printed = 0;
        for ($i = 1; $i <= 16; $i++)
          {
          if (defined $subs[$i])
            {
            while ($last_printed++ < $i-1)
              { printf $outfile ("%2d: <unset>\n", $last_printed); }
            printf $outfile ("%2d: %s\n", $i, &pchars($subs[$i]));
            $last_printed = $i;
            }
          }
        splice(@subs, 0, 18);
        }
225 226 227 228 229 230 231 232 233 234 235

      # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
      # set and the input pattern was a UTF-8 string. We can, however, force
      # it to be so marked.

      if (defined $REGMARK && $REGMARK != 1)
        {
        $xx = $REGMARK;
        $xx = Encode::decode_utf8($xx) if $utf8;
        printf $outfile ("MK: %s\n", &pchars($xx));
        }
eg's avatar
eg committed
236 237 238 239 240 241 242
      }
    }
  }

# printf $outfile "\n";

# End