[Okta Webinar] Learn how to a build a cloud-first strategyRegister Now

x
  • Status: Solved
  • Priority: Medium
  • Security: Public
  • Views: 787
  • Last Modified:

Regular expression componenent for delphi 5

does anyone know a free object/component for Regular exporession for delphi 5?

thanx
T.
0
tomer_engel
Asked:
tomer_engel
  • 2
1 Solution
 
DragonSlayerCommented:
Check out TurboPower's SysTool http://sourceforge.net/projects/tpsystools/
0
 
Lee_NoverCommented:
I'm using TRegExpr - http://RegExpStudio.com
0
 
Russell LibbySoftware Engineer, Advisory Commented:
Something I wrote last year. Its free, its fast, and it was written in Delphi5. If nothing else, it should provide a good (clean) example of a finite state machine (see the MatchSubString method for the actual token execution)

Regards,
Russell

----

unit RegExpr;
////////////////////////////////////////////////////////////////////////////////
//
//   Unit        :  REGEXPR
//   Description :  Regular expression parser for Delphi
//   Date        :  10.14.2003
//
//   Description :
//
//   ^        =  Start anchor indicator (if used in first position)
//   $        =  End anchor indicator (must be final character)
//   ( | )    =  Expression OR Expression. Parens start the ORing of two or more
//               sub expressions. For multiple ORing the following syntax can be
//               used: (expr1|expr2|expr3|...) Cannot have post modifiers
//   ^        =  Must be used after [, negates a class
//   .        =  Match any char (cannot be negated, and cannot have post modifiers)
//   *        =  Post modifier, match zero or more occurrences of char/char class
//   ?        =  Post modifier, match zero or one occurences of char/char class
//   +        =  Post modifier, match one or more occurences of char/char class
//   {}       =  Post modifier, allows for the patterns {m}, {m,} and {m, n} where
//               {m} matches exactly m occurrences of the char/char class
//               {m,} matches m occurences or more of the char/char class
//               {m, n} matches m to n occurences of the char / char class
//   []       =  Char class, cannot contain ".", pre/post modifiers, or other meta
//               characters except for "-".
//   -        =  Used to define char ranges in char class
//   @0..9    =  Mark the current bead position (integer value). Useful for seperating
//               sub parts of the expression
//   \        =  Escaped char: The following have special meaning
//               \a    =  Bell
//               \b    =  Backspace
//               \f    =  Formfeed
//               \n    =  Newline
//               \r    =  Carriage return
//               \t    =  Tab
//               \s    =  Space
//               \v    =  Vertical tab
//               All other escaped charactes are translated as literals. This allows
//               the use of what would otherwise be meta characters. (ie: *, ., [, ...)
//   %        =  Extended char class range. Must be trailed by one of the following:
//               %a    =  0-9, A-Z, a-z
//               %c    =  A-Z, a-z
//               %d    =  0-9
//               %h    =  0-9, A-F, a-f
//               %i    =  \_, \-, \$, 0-9, A-Z, a-z
//               %n    =  0-9
//               %w    =  \n, \r, \s, \t
//               Any other trailer will cause an expression error.
//   x        =  All other characters are taken as literal values.
//
////////////////////////////////////////////////////////////////////////////////
interface

{$Z4}
uses
  Windows, SysUtils, Classes;

////////////////////////////////////////////////////////////////////////////////
//   Regular expression exception class
////////////////////////////////////////////////////////////////////////////////
type
  ERegExpr          =  class(Exception);

////////////////////////////////////////////////////////////////////////////////
//   Regular expression data types
////////////////////////////////////////////////////////////////////////////////
type
  PCharSet          =  ^TCharSet;
  TCharSet          =  Set of Char;

////////////////////////////////////////////////////////////////////////////////
//   Regular expression instruction types
////////////////////////////////////////////////////////////////////////////////
type
  TRegType          =  (rtTerminal, rtBead, rtMark, rtMarkOr, rtMarkEnd, rtAnyChar,
                        rtChar, rtClass, rtNegClass);

////////////////////////////////////////////////////////////////////////////////
//   Regular expression post modifier types
////////////////////////////////////////////////////////////////////////////////
type
  TRegPost          =  (rpNone, rpOne, rpZMore, rpZOne, rpOneMore, rpExact, rpCMore, rpCtoC);

////////////////////////////////////////////////////////////////////////////////
//   Regular expression state structure
////////////////////////////////////////////////////////////////////////////////
type
  PRegState         =  ^TRegState;
  TRegState         =  packed record
     regType:       TRegType;
     regPost:       TRegPost;
     dwMin:         Integer;
     dwMax:         Integer;
     case Integer of
        0  :  (lpClass:       PCharSet);
        1  :  (cChar:         Char);
        2  :  (dwJump:        Integer);
  end;

////////////////////////////////////////////////////////////////////////////////
//   Regular expression constants
////////////////////////////////////////////////////////////////////////////////
const
  REG_NILJUMP       =  0;
  REG_NEGCLASS:     Array [False..True] of TRegType = (rtClass, rtNegClass);
  REG_TERMCHARS:    Set of Char = [#0, '$', '(', ')', '|'];
  REG_METACHARS:    Set of Char = ['$', '(', ')', '|', '^', '*', '?', '+', '-',
                                   '@', '[', ']', '.', '{', '}', ',', '%'];
  REG_PARSEERRORS:  Array [0..15] of PChar =
                    ('Expression pattern has not been set',            // 0
                     'Expression pattern cannot be blank',             // 1
                     'Invalid pattern expression',                     // 2
                     'Unexpected end of expression at position (%d)',  // 3
                     'Unexpected meta character at position (%d)',     // 4
                     'Characters after end anchor at position (%d)',   // 5
                     'Invalid special identifier at position (%d)',    // 6
                     'Invalid empty set at position (%d)',             // 7
                     'Min count must be less than max count (%d)',     // 8
                     'Expected a closing brace at position (%d)',      // 9
                     'Exact match count cannot be set to zero (%d)',   // 10
                     'Expected a numeric at position (%d)',            // 11
                     'Invalid numeric expression at position (%d)',    // 12
                     'Expected a pipe at position (%d)',               // 13
                     'Expected a closing paren at position (%d)',      // 14
                     'Bead markers must be bewteen 0 and 9'            // 15
                    );

////////////////////////////////////////////////////////////////////////////////
//   Regular expression class
////////////////////////////////////////////////////////////////////////////////
type
  TRegExpr          =  class(TObject)
  private
     // Private declarations
     FBeadPos:      Array [0..9] of Integer;
     FAncStart:     Boolean;
     FAncEnd:       Boolean;
     FMatchCase:    Boolean;
     FMatched:      Boolean;
     FMatchPos:     Integer;
     FMatchLen:     Integer;
     FTable:        TList;
     FError:        PChar;
     FPos:          PChar;
     FParsed:       Boolean;
     FPattern:      PChar;
     FSource:       PChar;
     FCursor:       PChar;
     FAnchor:       PChar;
  protected
     // Protected declarations
     function       GetSource: PChar;
     function       GetPattern: PChar;
     function       GetPosition: Integer;
     function       GetBeadPos(Index: Integer): Integer;
     procedure      SetSource(Source: PChar);
     procedure      SetPattern(Pattern: PChar);
     procedure      SetMatchCase(CaseSensitive: Boolean);
     procedure      ClearStateTable;
     function       AllocState(StateType: TRegType): PRegState;
     procedure      FreeState(State: PRegState);
     procedure      ParseSubExpression;
     procedure      ParseParens;
     procedure      ParseExpression;
     procedure      ParseAnyChar;
     procedure      ParseCharClass;
     procedure      ParseChar;
     procedure      ParseCharRange(CClass: PCharSet);
     procedure      ParseSpecialRange(CClass: PCharSet);
     function       ParseCCChar: Char;
     function       ParseEscChar: Char;
     function       ParseNum: Integer;
     procedure      ParsePostModifier(State: PRegState);
     procedure      ResetMatched;
     procedure      ResetAll;
     function       Match(Start: Boolean): Boolean;
     function       MatchSubString: Boolean;
     function       ScanLineStart: Boolean;
     procedure      RaiseError(Error: Integer; InsertPos: Boolean = False);
  public
     // Public declarations
     constructor    Create;
     constructor    CreatePattern(Pattern: PChar);
     destructor     Destroy; override;
     function       Test(Pattern: PChar): Boolean;
     function       MatchFirst: Boolean;
     function       MatchNext: Boolean;
     property       BeadPos[Index: Integer]: Integer read GetBeadPos;
     property       CaseSensitive: Boolean read FMatchCase write SetMatchCase;
     property       MatchPos: Integer read FMatchPos;
     property       MatchLen: Integer read FMatchLen;
     property       Matched: Boolean read FMatched;
     property       Pattern: PChar read GetPattern write SetPattern;
     property       Position: Integer read GetPosition;
     property       Source: PChar read GetSource write SetSource;
  end;

implementation

////////////////////////////////////////////////////////////////////////////////
//   TRegExpr
////////////////////////////////////////////////////////////////////////////////
procedure TRegExpr.RaiseError(Error: Integer; InsertPos: Boolean = False);
begin

  // Check to see if we should be formatting this
  if InsertPos then
     // Formatted
     StrFmt(FError, REG_PARSEERRORS[Error], [Succ(FPos-FPattern)])
  else
     // Straight copy
     StrCopy(FError, REG_PARSEERRORS[Error]);

  // Raise the exception with the detailed information
  raise ERegExpr.Create(FError);

end;

function TRegExpr.GetBeadPos(Index: Integer): Integer;
begin

  // Check index range for bead position
  if not(Index in [0..9]) then RaiseError(15);

  // Return the bead marker
  result:=FBeadPos[Index];

end;

function TRegExpr.Test(Pattern: PChar): Boolean;
begin

  // Set the pattern to test
  SetPattern(Pattern);

  // Call MatchFirst and return results
  result:=MatchFirst;

end;

function TRegExpr.GetPosition: Integer;
begin

  // Return the current anchor position within the source text
  result:=FAnchor-FSource;

end;

function TRegExpr.MatchNext: Boolean;
begin

  // Reset matched state
  ResetMatched;

  // Call Match
  result:=Match(False);

end;

function TRegExpr.MatchFirst: Boolean;
begin

  // Reset matched state and source starting position
  ResetAll;

  // Call Match
  result:=Match(True);

end;

function TRegExpr.Match(Start: Boolean): Boolean;
begin

  // Set default result
  result:=False;

  // Check to make sure the expression was parsed
  if FParsed then
  begin
     // Check anchor start
     if FAncStart then
     begin
        // If not the first time then scan to a new line
        if not(Start) then ScanLineStart;
        // Keep looping until match, or cursor is at the end
        while (FCursor^ > #0) do
        begin
           // Search for the pattern
           result:=MatchSubString;
           if result then break;
           // Increment the anchor
           Inc(FAnchor);
           // Scan to the next line
           if not(ScanLineStart) then break;
        end;
     end
     else
     begin
        // Keep looping until match, or cursor is at the end
        while (FCursor^ > #0) do
        begin
           // Search for the pattern
           result:=MatchSubString;
           if result then break;
           // Increment the anchor
           Inc(FAnchor);
           // Set the cursor to the anchor
           FCursor:=FAnchor;
        end;
     end;
     // If we did not succeed then we must perform a full reset
     if result then
     begin
        // Set match pos and length
        FMatched:=True;
        FMatchPos:=FAnchor-FSource;
        FMatchLen:=FCursor-FAnchor;
        // If match length is zero, which can happen when matching zero times, then
        // we may run into cases where the cursor does not get incremented. We need
        // to increment for those cases
        if ((FMatchLen = 0) and (FCursor^ > #0)) then Inc(FCursor);
        // Set the next starting search point
        FAnchor:=FCursor;
     end
     else
        ResetAll;
  end
  else
     // Not parsed
     RaiseError(0);

end;

function TRegExpr.MatchSubString: Boolean;
var  prsNode:    PRegState;
     ppaTable:   PPointerList;
     dwCharLoop: Integer;
     dwIndex:    Integer;
     dwBranch:   Integer;
     lpszHold:   PChar;
     bMatch:     Boolean;
     cChar:      Char;
begin

  // Set default result (optimisitic)
  result:=True;

  // Set node pointer
  prsNode:=nil;

  // Map in the table as a straight pointer array
  ppaTable:=FTable.List;

  // Clear bead positions
  dwIndex:=10;
  repeat
     Dec(dwIndex);
     FBeadPos[dwIndex]:=0;
  until (dwIndex = 0);

  // Set string index and default jump pointers
  lpszHold:=FCursor;
  dwBranch:=REG_NILJUMP;

  // Loop until we are done
  while True do
  begin
     // Get the state record and make sure the index is incremented
     prsNode:=ppaTable^[dwIndex];
     Inc(dwIndex);
     // Check for terminating state or expression break
     case prsNode^.regType of
        // End of compiled expression
        rtTerminal  :  break;
        // Bead handling
        rtBead      :  FBeadPos[prsNode^.dwJump]:=FCursor-FSource;
        // START MARKER
        rtMark      :
        begin
           // Mark the current string and set the branch point
           dwBranch:=prsNode^.dwJump;
           lpszHold:=FCursor;
        end;
        // OR MARKER
        rtMarkOr    :
        begin
           // Check current result
           if result then
           begin
              // Jump out of here
              dwIndex:=prsNode^.dwJump;
              dwBranch:=REG_NILJUMP;
           end
           else
           begin
              // Reset the cursor and set new jump point
              dwBranch:=prsNode^.dwJump;
              FCursor:=lpszHold;
              // Reset the result
              result:=True;
           end;
        end;
        // END MARKER
        rtMarkEnd   :
        begin
           // Clear the jump points
           dwBranch:=REG_NILJUMP;
           // Check result
           if result then Continue;
           // Failure
           Break;
        end;
     else
        // Simple char state loop counter to handle the multiple matching
        dwCharLoop:=1;
        while True do
        begin
           // Perform the character matching (if we have string)
           if (FCursor^ > #0) then
           begin
              if FMatchCase then
                 cChar:=FCursor^
              else
                 cChar:=UpCase(FCursor^);
              case prsNode^.regType of
                 rtAnyChar   :  bMatch:=not(cChar = #10);
                 rtChar      :  bMatch:=(cChar = prsNode^.cChar);
                 rtClass     :  bMatch:=(cChar in prsNode^.lpClass^);
                 rtNegClass  :  bMatch:=not(cChar in prsNode^.lpClass^);
              else
                 // Set match flag to false
                 bMatch:=False;
              end;
              // Increment the cursor position
              Inc(FCursor, Ord(bMatch));
           end
           else
              // Set match flag to false
              bMatch:=False;
           // Handle the post operator
           case prsNode^.regPost of
              // No modifier, or single match
              rpNone,
              rpOne          :
              begin
                 result:=bMatch;
                 break;
              end;
              // Match zero or more times
              rpZMore        :  if not(bMatch) then break;
              // Match zero or one times
              rpZOne         :  break;
              // Match one of more times
              rpOneMore      :
              begin
                 if (dwCharLoop = 1) then result:=bMatch;
                 if not(bMatch) then break;
              end;
              // Match exactly count times
              rpExact        :
              begin
                 result:=bMatch;
                 if not(bMatch) or (dwCharLoop >= prsNode^.dwMin) then break;
              end;
              // Match at least count or more times
              rpCMore        :
              begin
                 if (dwCharLoop <= prsNode^.dwMin) then result:=bMatch;
                 if not(bMatch) then break;
              end;
              // Match at least min times up to max times
              rpCtoC         :
              begin
                 if (dwCharLoop <= prsNode^.dwMin) then result:=bMatch;
                 if not(bMatch) or (dwCharLoop >= prsNode^.dwMax) then break;
              end;
           end;
           // Increment the char loop counter
           Inc(dwCharLoop);
        end;
        // Check for failure
        if not(result) then
        begin
           // If no jump is set then we fail
           if (dwBranch = REG_NILJUMP) then break;
           // Perform the jump
           dwIndex:=dwBranch;
        end;
     end;
  end;

  // Success, we still need to check against the terminal state in case we
  // ran out of string to process
  if result and Assigned(prsNode) then
  begin
     // Make sure we processed the terminal state
     result:=(prsNode^.regType = rtTerminal);
     // Still need to check against an end anchor if one was specified
     if (result and FAncEnd) then result:=(FCursor^ in [#0, #13, #10]);
  end
  else
     result:=False;

end;

function TRegExpr.ScanLineStart: Boolean;
begin

  // Set default result
  result:=False;

  // Need to position the cursor right after a line break
  while not(FCursor^ in [#0, #10]) do Inc(FCursor);

  // If we are at a null then there is no next line so return false
  if (FCursor^ > #0) then
  begin
     // Increment past the line feed
     Inc(FCursor);
     // Reset the anchor to match the cursor
     FAnchor:=FCursor;
     // Success
     result:=True;
  end;

end;

procedure TRegExpr.ParseSubExpression;
var  prsNode:    PRegState;
begin

  // Parse the sub expression until we hit a terminating char
  while not(FPos^ in REG_TERMCHARS) do
  begin
     // Handle the character
     case FPos^ of
        // Any character
        '.'   :  ParseAnyChar;
        // Character class
        '['   :  ParseCharClass;
        // Parse a bead position
        '@'   :
        begin
           // Skip next
           Inc(FPos);
           // Bead must be bewteen 0..9
           if not(FPos^ in ['0'..'9']) then RaiseError(15);
           // Allocate a new state
           prsNode:=AllocState(rtBead);
           prsNode^.dwJump:=Ord(FPos^)-48;
           FTable.Add(prsNode);
           // Skip next
           Inc(FPos);
        end;
        // Parse an extended char class
        '%'   :
        begin
           // Resource protection
           prsNode:=nil;
           try
              // Allocate a new state
              prsNode:=AllocState(rtClass);
              prsNode^.lpClass:=AllocMem(SizeOf(TCharSet));
              // Parse the range and post modifier
              ParseSpecialRange(prsNode^.lpClass);
              ParsePostModifier(prsNode);
              // Add to the state table
              FTable.Add(prsNode);
           except
              // Free the state
              FreeState(prsNode);
              // Re-raise the exception
              raise;
           end;
        end;
     else
        // It is a failure to be sitting at a meta char at this point
        if (FPos^ in REG_METACHARS) then
           RaiseError(4, True)
        else
           // Parse a literal or escaped character
           ParseChar;
     end;
  end;

end;

procedure TRegExpr.ParseParens;
var  prsMark:    PRegState;
     prsMarkOr:  PRegState;
     prsMarkEnd: PRegState;
     dwMark:     Integer;
begin

  // Skip starting paren
  Inc(FPos);

  // Add the mark record (hold pointer so we can fill the jump)
  prsMark:=AllocState(rtMark);
  dwMark:=FTable.Add(prsMark);

  // Parse the sub expression
  ParseSubExpression;

  // Better be sitting at a | delimiter
  if (FPos^ <> '|') then RaiseError(13, True);
  Inc(FPos);

  // Add the mark OR
  prsMarkOr:=AllocState(rtMarkOr);

  // Add to the table and set the mark jump point
  prsMark^.dwJump:=FTable.Add(prsMarkOr);

  // Switch
  prsMark:=prsMarkOr;

  // Parse the second part of the expression
  ParseSubExpression;

  // Keep iterating while we have an OR
  while (FPos^ = '|') do
  begin
     // Push next
     Inc(FPos);
     // Add the subsequent ORs
     prsMarkOr:=AllocState(rtMarkOr);
     // Add to the table and set the mark jump point
     prsMark^.dwJump:=FTable.Add(prsMarkOr);
     // Switch
     prsMark:=prsMarkOr;
     // Parse the subsequent sub expression
     ParseSubExpression;
  end;

  // End the mark
  if (FPos^ <> ')') then RaiseError(14, True);
  Inc(FPos);

  // Allocate the mark end and set loop jump plus post modifiers
  prsMarkEnd:=AllocState(rtMarkEnd);
  prsMarkEnd^.dwJump:=dwMark;

  // Add the end and update the last jump
  prsMark^.dwJump:=FTable.Add(prsMarkEnd);

end;

procedure TRegExpr.ParseExpression;
var  prsNode:       PRegState;
begin

  // Clear the current state table and reparse
  ClearStateTable;

  // Clear match settings
  ResetMatched;

  // Reset expression parser anchors
  FAncStart:=False;
  FAncEnd:=False;

  // Set parser variable
  FPos:=FPattern;

  // Check for null pattern string
  if (FPos^ = #0) then RaiseError(1);

  // Check for starting anchor
  if (FPos^ = '^') then
  begin
     // Set anchor start flag
     FAncStart:=True;
     Inc(FPos);
     // Check for null
     if (FPos^ = #0) then RaiseError(2);
  end
  else
     // Check for just an ending anchor
     if (FPos^ = '$') then RaiseError(2);

  // Catch exceptions so we can clear the state table on failure
  try
     // Parse until we hit an ending char
     while not(FPos^ in [#0, '$']) do
     begin
        case FPos^ of
           '('   :  ParseParens;
           ')'   :  RaiseError(4, True);
           '|'   :  RaiseError(4, True);
        else
           ParseSubExpression;
        end;
     end;
     // Check for tail end anchor
     if (FPos^ = '$') then
     begin
        FAncEnd:=True;
        // Push next
        Inc(FPos);
        // We must be at the null
        if (FPos^ > #0) then RaiseError(5, True);
     end;
     // Add the terminal state
     prsNode:=AllocState(rtTerminal);
     FTable.Add(prsNode);
  except
     // Clear the anchor flags
     FAncStart:=False;
     FAncEnd:=False;
     // Clear the state table
     ClearStateTable;
     // Re-raise the exception again
     raise;
  end;

end;

procedure TRegExpr.ParseChar;
var  prsNode:    PRegState;
     cChar:      Char;
begin

  // Is the pattern null
  if (FPos^ = #0) then RaiseError(3, True);

  // Check for invalid meta characters
  if (FPos^ in REG_METACHARS) then RaiseError(4, True);

  // Check for escaped character
  if (FPos^ = '\') then
     cChar:=ParseEscChar
  else
     // Literal character
     cChar:=FPos^;

  // Push next
  Inc(FPos);

  // Allocate a new state
  prsNode:=AllocState(rtChar);
  if not(FMatchCase) then
     prsNode^.cChar:=UpCase(cChar)
  else
     prsNode^.cChar:=cChar;

  try
     // Parse out any post modifier
     ParsePostModifier(prsNode);
  except
     // Chance to free the memory
     FreeState(prsNode);
     // Re-raise the exception
     raise;
  end;

  // Add to state table
  FTable.Add(prsNode);

end;

function TRegExpr.ParseEscChar: Char;
begin

  // Push next
  Inc(FPos);

  // Check the next character
  case FPos^ of
     #0    :
     begin
        result:=#0;
        RaiseError(3, True);
     end;
     'a'   :  result:=#7;
     'b'   :  result:=#8;
     'f'   :  result:=#12;
     'n'   :  result:=#10;
     'r'   :  result:=#13;
     't'   :  result:=#9;
     's'   :  result:=#32;
     'v'   :  result:=#11;
  else
     // Escape as is
     result:=FPos^;
  end;

end;

function TRegExpr.ParseCCChar: Char;
begin

  // Is the pattern null
  if (FPos^ = #0) then RaiseError(3, True);

  // Check for invalid meta characters
  if (FPos^ in REG_METACHARS) then RaiseError(4, True);

  // Check for escaped character
  if (FPos^ = '\') then
     result:=ParseEscChar
  else
     // Literal character
     result:=FPos^;

  // Push next
  Inc(FPos);

end;

procedure TRegExpr.ParseSpecialRange(CClass: PCharSet);
var  cChar:      Char;
begin

  // Push next
  Inc(FPos);

  // Is the pattern null
  if (FPos^ = #0) then RaiseError(3, True);

  // Check for invalid meta characters
  if (FPos^ in REG_METACHARS) then RaiseError(4, True);

  // Handle the special char range
  case FPos^ of
     // Alpha, numeric
     'a'   :
     begin
        for cChar:='0' to '9' do Include(CClass^, cChar);
        for cChar:='A' to 'Z' do Include(CClass^, cChar);
        for cChar:='a' to 'z' do Include(CClass^, cChar);
     end;
     // Alpha
     'c'   :
     begin
        for cChar:='A' to 'Z' do Include(CClass^, cChar);
        for cChar:='a' to 'z' do Include(CClass^, cChar);
     end;
     // Numeric
     'd'   :  for cChar:='0' to '9' do Include(CClass^, cChar);
     // Hexadecimal
     'h'   :
     begin
        for cChar:='0' to '9' do Include(CClass^, cChar);
        for cChar:='A' to 'F' do Include(CClass^, cChar);
        for cChar:='a' to 'f' do Include(CClass^, cChar);
     end;
     // Alpha, numeric, underscore, dash, dollar
     'i'   :
     begin
        Include(CClass^, '_');
        Include(CClass^, '-');
        Include(CClass^, '$');
        for cChar:='0' to '9' do Include(CClass^, cChar);
        for cChar:='A' to 'Z' do Include(CClass^, cChar);
        for cChar:='a' to 'z' do Include(CClass^, cChar);
     end;
     // White space
     'w'   :
     begin
        Include(CClass^, #9);
        Include(CClass^, #10);
        Include(CClass^, #13);
        Include(CClass^, #32);
     end;
  else
     // Invalid identifier
     RaiseError(6, True);
  end;

  // Push next
  Inc(FPos);

end;

procedure TRegExpr.ParseCharRange(CClass : PCharSet);
var  cStart:     Char;
     cEnd:       Char;
     cSwap:      Char;
begin

  // Check for special char range
  if (FPos^ = '%') then
     ParseSpecialRange(CClass)
  else
  begin
     // Parse a single character
     cStart:=ParseCCChar;
     // Check for character range
     if (FPos^ = '-') then
     begin
        // Range of characters
        Inc(FPos);
        // Parse the next character
        cEnd:=ParseCCChar;
        // Build the range as a character set
        if (cStart > cEnd) then
        begin
           // Exchange first and last
           cSwap:=cStart;
           cStart:=cEnd;
           cEnd:=cSwap;
        end;
        // Include the characters in the character set
        for cSwap:=cStart to cEnd do
        begin
           Include(CClass^, cSwap);
           if not(FMatchCase) then Include(CClass^, UpCase(cSwap));
        end;
     end
     else
     begin
        // Single character
        if FMatchCase then
           Include(CClass^, cStart)
        else
           Include(CClass^, UpCase(cStart));
     end;
  end;

end;

procedure TRegExpr.ParseCharClass;
var  prsNode:    PRegState;
     bNegate:    Boolean;
begin

  // Push next
  Inc(FPos);

  // Is this a negated class?
  bNegate:=(FPos^ = '^');
  if bNegate then Inc(FPos);

  // Check for an immediate ], which means an empty set (which is not allowed)
  if (FPos^ = ']') then RaiseError(7, True);

  // Allocate a state record and char class
  prsNode:=AllocState(REG_NEGCLASS[bNegate]);
  prsNode^.lpClass:=AllocMem(SizeOf(TCharSet));

  // Start parsing the character ranges until we hit the closing bracket
  try
     while not(FPos^ in [#0, ']']) do ParseCharRange(prsNode^.lpClass);
     // We must have the ending bracket
     if (FPos^ = #0) then RaiseError(3, True);
     // Push next
     Inc(FPos);
     // Parse out any post modifier
     ParsePostModifier(prsNode);
  except
     // Gives us the chance to clean up
     FreeState(prsNode);
     // Re-raise the exception
     raise;
  end;

  // Add to state table
  FTable.Add(prsNode);

end;

procedure TRegExpr.ParseAnyChar;
var  prsNode:    PRegState;
begin

  // Push next
  Inc(FPos);

  // Add the new state record
  prsNode:=AllocState(rtAnyChar);
  FTable.Add(prsNode);

end;

procedure TRegExpr.ParsePostModifier(State: PRegState);
begin

  // Set default post modifier
  State^.regPost:=rpOne;

  // If not a post modifier then just exit
  if not(FPos^ in ['*', '?', '+', '{']) then exit;

  // Set the post modifier
  case FPos^ of
     '*'   :  State^.regPost:=rpZMore;
     '?'   :  State^.regPost:=rpZOne;
     '+'   :  State^.regPost:=rpOneMore;
     '{'   :  State^.regPost:=rpExact;
  end;

  // Push next
  Inc(FPos);

  // If *, ?, or + then we are done
  if (State^.regPost <> rpExact) then exit;

  // Get the first number
  State^.dwMin:=ParseNum;

  // Check for end of expression
  if (FPos^ = #0) then RaiseError(3, True);

  // Next character needs to be either a , or }
  case FPos^ of
     ','   :
     begin
        // Push next
        Inc(FPos);
        // Check for end of expression
        if (FPos^ = #0) then RaiseError(3, True);
        // We may have either a closing } or a numeric
        if (FPos^ = '}') then
           State^.regPost:=rpCMore
        else
        begin
           // Going to be a min to max range
           State^.regPost:=rpCtoC;
           // Get max range count
           State^.dwMax:=ParseNum;
           // Check for end of expression
           if (FPos^ = #0) then RaiseError(3, True);
           // Min has to be less than Max
           if (State^.dwMin >= State^.dwMax) then RaiseError(8, True);
           // Check for closing }
           if (FPos^ <> '}') then RaiseError(9, True);
        end;
     end;
     '}'   :  if (State^.dwMin = 0) then RaiseError(10, True);
  else
     RaiseError(9, True);
  end;

  // Push next
  Inc(FPos);

end;

function TRegExpr.ParseNum: Integer;
var  lpszHold:   PChar;
     cMark:      Char;
     dwErr:      Integer;
begin

  // Save the current position
  lpszHold:=FPos;

  // Better be a number waiting
  if not(FPos^ in ['0'..'9']) then RaiseError(11, True);

  // Keep incrementing while numeric
  while (FPos^ in ['0'..'9']) do Inc(FPos);

  // Terminate and convert to number
  cMark:=FPos^;
  FPos^:=#0;
  Val(lpszHold, result, dwErr);
  FPos^:=cMark;

  // Check for error
  if (dwErr > 0) then
  begin
     FPos:=lpszHold;
     Inc(FPos, dwErr);
     RaiseError(12, True);
  end;

end;

procedure TRegExpr.ResetAll;
begin

  // Reset scanning positions
  FCursor:=FSource;
  FAnchor:=FSource;

  // Reset matched state
  FMatched:=False;
  FMatchPos:=0;
  FMatchLen:=0;

end;

procedure TRegExpr.ResetMatched;
begin

  // Set anchor and cursor to equal source
  FMatched:=False;
  FMatchPos:=0;
  FMatchLen:=0;

end;

procedure TRegExpr.SetMatchCase(CaseSensitive: Boolean);
begin

  // Check current setting
  if (CaseSensitive <> FMatchCase) then
  begin
     // Set new case sensitive setting
     FMatchCase:=CaseSensitive;
     // Reparse if we were in a parsed state
     if FParsed then ParseExpression;
  end;

end;

function TRegExpr.GetSource: PChar;
begin

  // Return the source
  result:=FSource;

end;

function TRegExpr.GetPattern: PChar;
begin

  // Return the pattern
  result:=FPattern;

end;

procedure TRegExpr.SetSource(Source: PChar);
begin

  // Assign the new source
  FreeMem(FSource);
  FSource:=AllocMem(Succ(StrLen(Source)));
  StrCopy(FSource, Source);

  // Source has changed, update the new state
  ResetAll;

end;

procedure TRegExpr.SetPattern(Pattern: PChar);
begin

  // Assign the new pattern
  FreeMem(FPattern);
  FPattern:=AllocMem(Succ(StrLen(Pattern)));
  StrCopy(FPattern, Pattern);

  // Reset parsed state
  FParsed:=False;

  // Parse the expression
  ParseExpression;

  // Set parsed state to true
  FParsed:=True;

end;

procedure TRegExpr.FreeState(State: PRegState);
begin

  // Is the state record assigned?
  if Assigned(State) then
  begin
     // If char class is allocated then we need to free the memory for the char set
     if (State^.regType in [rtClass, rtNegClass]) then FreeMem(State^.lpClass);
     // Free the state record
     FreeMem(State);
  end;

end;

procedure TRegExpr.ClearStateTable;
var  dwIndex:    Integer;
begin

  // Clear all the NFA state records
  for dwIndex:=Pred(FTable.Count) downto 0 do FreeState(FTable[dwIndex]);

  // Clear the table
  FTable.Clear;

end;

function TRegExpr.AllocState(StateType: TRegType): PRegState;
begin

  // Allocate a new regx state record
  result:=AllocMem(SizeOf(TRegState));

  // Set the type
  result^.regType:=StateType;

end;

constructor TRegExpr.CreatePattern(Pattern: PChar);
begin

  // Perform inherited for TRegExpr
  Create;

  // Set the pattern
  SetPattern(Pattern);

end;

constructor TRegExpr.Create;
begin

  // Perform inherited
  inherited Create;

  // Initialization
  FPattern:=AllocMem(1);
  FSource:=AllocMem(1);
  FError:=AllocMem(256);
  FPos:=FPattern;
  FAncStart:=False;
  FAncEnd:=False;
  FMatchCase:=False;
  FTable:=TList.Create;
  FParsed:=False;

  // Reset to default (unmatched) state
  ResetAll;

end;

destructor TRegExpr.Destroy;
begin

  // Clear the state records
  ClearStateTable;

  // Free the state table list
  FTable.Free;

  // Free all allocated memory
  FreeMem(FPattern);
  FreeMem(FSource);
  FreeMem(FError);

  // Perform inherited
  inherited Destroy;

end;

end.
0
 
DragonSlayerCommented:
Wow Russell... your own reg exp? You must be too free :-P
0

Featured Post

Keep up with what's happening at Experts Exchange!

Sign up to receive Decoded, a new monthly digest with product updates, feature release info, continuing education opportunities, and more.

  • 2
Tackle projects and never again get stuck behind a technical roadblock.
Join Now