New versions straight from Jeffrey Ollie's web site

author: Guido van Rossum <guido@python.org> 1997-07-10 14:31:32 (GMT)
committer: Guido van Rossum <guido@python.org> 1997-07-10 14:31:32 (GMT)
commit: db25f32849730915291993ece2023c1645ff73d2 (patch)
tree: 5e9b3dccbc4c005f92f6a4b4820917e7c9a12629 /Modules
parent: db9e20f41802c104d688002af22a090b60a656b4 (diff)
download: cpython-db25f32849730915291993ece2023c1645ff73d2.zip
cpython-db25f32849730915291993ece2023c1645ff73d2.tar.gz
cpython-db25f32849730915291993ece2023c1645ff73d2.tar.bz2
3 files changed, 700 insertions, 169 deletions
diff --git a/Modules/regexpr.c b/Modules/regexpr.c
index 87f747f..0b6ec8b 100644
--- a/Modules/regexpr.c
+++ b/Modules/regexpr.c
@@ -1,3 +1,7 @@
+/*
+ * -*- mode: c-mode; c-file-style: python -*-
+ */
+
 /* regexpr.c
  *
  * Author: Tatu Ylonen <ylo@ngs.fi>
@@ -57,6 +61,12 @@ char *realloc();
 #endif /* __STDC__ */
 #endif /* THINK_C */
 
+/* The original code blithely assumed that sizeof(short) == 2.  Not
+ * always true.  Original instances of "(short)x" were replaced by
+ * SHORT(x), where SHORT is #defined below.  */
+
+#define SHORT(x) ((x) & 0x8000 ? (x) - 0x10000 : (x))
+
 /* The stack implementation is taken from an idea by Andrew Kuchling.
  * It's a doubly linked list of arrays. The advantages of this over a
  * simple linked list are that the number of mallocs required are
@@ -75,27 +85,27 @@ char *realloc();
 
 typedef union item_t
 {
-      struct
-      {
-	    int num;
-	    int level;
-	    char *start;
-	    char *end;
-      } reg;
-      struct
-      {
-	    int count;
-	    int level;
-	    int phantom;
-	    char *code;
-	    char *text;
-      } fail;
-      struct
-      {
-	    int num;
-	    int level;
-	    int count;
-      } cntr;
+	struct
+	{
+		int num;
+		int level;
+		char *start;
+		char *end;
+	} reg;
+	struct
+	{
+		int count;
+		int level;
+		int phantom;
+		char *code;
+		char *text;
+	} fail;
+	struct
+	{
+		int num;
+		int level;
+		int count;
+	} cntr;
 } item_t;
 
 #define STACK_PAGE_SIZE 256
@@ -105,43 +115,98 @@ typedef union item_t
 
 typedef struct item_page_t
 {
-      item_t items[STACK_PAGE_SIZE];
-      struct item_page_t *prev;
-      struct item_page_t *next;
+	item_t items[STACK_PAGE_SIZE];
+	struct item_page_t *prev;
+	struct item_page_t *next;
 } item_page_t;
 
 
 typedef struct match_state
 {
-      /* Structure to encapsulate the stack. */
-      struct
-      {
-	    /* index into the curent page.  If index == 0 and you need
-	     * to pop and item, move to the previous page and set
-	     * index = STACK_PAGE_SIZE - 1.  Otherwise decrement index
-	     * to push a page. If index == STACK_PAGE_SIZE and you
-	     * need to push a page move to the next page and set index
-	     * = 0. If there is no new next page, allocate a new page
-	     * and link it in. Otherwise, increment index to push a
-	     * page. */
-	    int index;
-	    item_page_t *current; /* Pointer to the current page. */
-	    item_page_t first; /* First page is statically allocated. */
-      } stack;
-      char *start[NUM_REGISTERS];
-      char *end[NUM_REGISTERS];
-
-      int changed[NUM_REGISTERS];
-      /* The number of registers that have been pushed onto the stack
-       * since the last failure point. */
-      int count;
-      /* Used to control when registers need to be pushed onto the
-       * stack. */
-      int level;
-      /* The number of failure points on the stack. */
-      int point;
+	/* The number of registers that have been pushed onto the stack
+	 * since the last failure point. */
+
+	int count;
+
+	/* Used to control when registers need to be pushed onto the
+	 * stack. */
+	
+	int level;
+	
+	/* The number of failure points on the stack. */
+	
+	int point;
+	
+	/* Storage for the registers.  Each register consists of two
+	 * pointers to characters.  So register N is represented as
+	 * start[N] and end[N].  The pointers must be converted to
+	 * offsets from the beginning of the string before returning the
+	 * registers to the calling program. */
+	
+	char *start[NUM_REGISTERS];
+	char *end[NUM_REGISTERS];
+	
+	/* Keeps track of whether a register has changed recently. */
+	
+	int changed[NUM_REGISTERS];
+	
+	/* Structure to encapsulate the stack. */
+	struct
+	{
+		/* index into the curent page.  If index == 0 and you need
+		 * to pop an item, move to the previous page and set index
+		 * = STACK_PAGE_SIZE - 1.  Otherwise decrement index to
+		 * push a page. If index == STACK_PAGE_SIZE and you need
+		 * to push a page move to the next page and set index =
+		 * 0. If there is no new next page, allocate a new page
+		 * and link it in. Otherwise, increment index to push a
+		 * page. */
+		
+		int index;
+		item_page_t *current; /* Pointer to the current page. */
+		item_page_t first; /* First page is statically allocated. */
+	} stack;
 } match_state;
 
+/* Initialize a state object */
+
+/* #define NEW_STATE(state) \ */
+/* memset(&state, 0, (void *)(&state.stack) - (void *)(&state)); \ */
+/* state.stack.current = &state.stack.first; \ */
+/* state.stack.first.prev = NULL; \ */
+/* state.stack.first.next = NULL; \ */
+/* state.stack.index = 0; \ */
+/* state.level = 1 */
+
+#define NEW_STATE(state, nregs) \
+{ \
+	int i; \
+	for (i = 0; i < nregs; i++) \
+	{ \
+		state.start[i] = NULL; \
+		state.end[i] = NULL; \
+		state.changed[i] = 0; \
+	} \
+	state.stack.current = &state.stack.first; \
+	state.stack.first.prev = NULL; \
+	state.stack.first.next = NULL; \
+	state.stack.index = 0; \
+	state.level = 1; \
+	state.count = 0; \
+	state.level = 0; \
+	state.point = 0; \
+}
+
+/* Free any memory that might have been malloc'd */
+
+#define FREE_STATE(state) \
+while(state.stack.first.next != NULL) \
+{ \
+	state.stack.current = state.stack.first.next; \
+	state.stack.first.next = state.stack.current->next; \
+	free(state.stack.current); \
+}
+
 /* Discard the top 'count' stack items. */
 
 #define STACK_DISCARD(stack, count, on_error) \
@@ -226,24 +291,6 @@ else \
 #define STACK_EMPTY(stack) ((stack.index == 0) && \
 			    (stack.current->prev == NULL))
 
-
-/* Initialize a state object */
-
-#define NEW_STATE(state) \
-memset(&state, 0, sizeof(match_state)); \
-state.stack.current = &state.stack.first; \
-state.level = 1
-
-/* Free any memory that might have been malloc'd */
-
-#define FREE_STATE(state) \
-while(state.stack.first.next != NULL) \
-{ \
-   state.stack.current = state.stack.first.next; \
-   state.stack.first.next = state.stack.current->next; \
-   free(state.stack.current); \
-}
-
 /* Return the start of register 'reg' */
 
 #define GET_REG_START(state, reg) (state.start[reg])
@@ -302,22 +349,6 @@ state.end[reg] = text
 
 /* Update the last failure point with a new position in the text. */
 
-/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
-/* { \ */
-/*    item_t *item; \ */
-/*    STACK_DISCARD(state.stack, state.count, on_error); \ */
-/*    STACK_TOP(state.stack, item, on_error); \ */
-/*    item->fail.text = xtext; \ */
-/*    state.count = 0; \ */
-/* } */
-
-/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
-/* { \ */
-/*    item_t *item; \ */
-/*    STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
-/*    item->fail.text = xtext; \ */
-/* } */
-
 #define UPDATE_FAILURE(state, xtext, on_error) \
 { \
    item_t *item; \
@@ -391,7 +422,8 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */
   Cwordbound,		/* match if at word boundary */
   Cnotwordbound,	/* match if not at word boundary */
   Csyntaxspec,		/* matches syntax code (1 byte follows) */
-  Cnotsyntaxspec	/* matches if syntax code does not match (1 byte foll)*/
+  Cnotsyntaxspec,	/* matches if syntax code does not match (1 byte foll)*/
+  Crepeat1
 };
 
 enum regexp_syntax_op	/* syntax codes for plain and quoted characters */
@@ -581,6 +613,8 @@ static void re_compile_fastmap_aux(char *code,
 	 case Cwordbound:
 	 case Cnotwordbound:
 	 {
+	    for (a = 0; a < 256; a++)
+	       fastmap[a] = 1;
 	    break;
 	 }
 	 case Csyntaxspec:
@@ -648,7 +682,7 @@ static void re_compile_fastmap_aux(char *code,
 	 {
 	    a = (unsigned char)code[pos++];
 	    a |= (unsigned char)code[pos++] << 8;
-	    pos += (int)(short)a;
+	    pos += (int)SHORT(a);
 	    if (visited[pos])
 	    {
 	       /* argh... the regexp contains empty loops.  This is not
@@ -664,10 +698,15 @@ static void re_compile_fastmap_aux(char *code,
 	 {
 	    a = (unsigned char)code[pos++];
 	    a |= (unsigned char)code[pos++] << 8;
-	    a = pos + (int)(short)a;
+	    a = pos + (int)SHORT(a);
 	    re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
 	    break;
 	 }
+	 case Crepeat1:
+	 {
+	    pos += 2;
+	    break;
+	 }
 	 default:
 	 {
 	    abort();  /* probably some opcode is missing from this switch */
@@ -754,10 +793,11 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
    char ch;
    int a;
    int b;
+   int num_instructions = 0;
 
    a = (unsigned char)*code++;
    a |= (unsigned char)*code++ << 8;
-   a = (int)(short)a;
+   a = (int)SHORT(a);
 
    p1 = code + a + 3; /* skip the failure_jump */
    assert(p1[-3] == Cfailure_jump);
@@ -775,6 +815,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
       
    /* loop until we find something that consumes a character */
   loop_p1:
+   num_instructions++;
    switch (*p1++)
    {
       case Cbol:
@@ -824,6 +865,7 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
    /* now we know that we can't backtrack. */
    while (p1 != p2 - 3)
    {
+      num_instructions++;
       switch (*p1++)
       {
 	 case Cend:
@@ -873,11 +915,22 @@ static int re_optimize_star_jump(regexp_t bufp, char *code)
       }
    }
 
+  make_update_jump:
    code -= 3;
    a += 3;  /* jump to after the Cfailure_jump */
    code[0] = Cupdate_failure_jump;
    code[1] = a & 0xff;
    code[2] = a >> 8;
+   if (num_instructions > 1)
+      return 1;
+   assert(num_instructions == 1);
+   /* if the only instruction matches a        single character, we can do
+    * better
+    */
+   p1 = code + 3 + a;   /* start of sole instruction */
+   if (*p1 == Cset || *p1 == Cexact || *p1 == Canychar ||
+       *p1 == Csyntaxspec || *p1 == Cnotsyntaxspec)
+      code[0] =        Crepeat1;
    return 1;
 
   make_normal_jump:
@@ -939,6 +992,7 @@ static int re_optimize(regexp_t bufp)
 	 case Cjump:
 	 case Cdummy_failure_jump:
 	 case Cfailure_jump:
+	 case Crepeat1:
 	 {
 	    code += 2;
 	    break;
@@ -1111,7 +1165,8 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp)
       re_compile_initialize();
    bufp->used = 0;
    bufp->fastmap_accurate = 0;
-   bufp->uses_registers = 0;
+   bufp->uses_registers = 1;
+   bufp->num_registers = 1;
    translate = bufp->translate;
    pattern = bufp->buffer;
    alloc = bufp->allocated;
@@ -1289,6 +1344,7 @@ char *re_compile_pattern(char *regex, int size, regexp_t bufp)
 	       STORE(Cstart_memory);
 	       STORE(next_register);
 	       open_registers[num_open_registers++] = next_register;
+	       bufp->num_registers++;
 	       next_register++;
 	    }
 	    paren_depth++;
@@ -1545,27 +1601,8 @@ int re_match(regexp_t bufp,
   code = bufp->buffer;
   
   translate = bufp->translate;
-/*   translated = NULL; */
-/*   if (bufp->translate) */
-/*   { */
-/*      char *t1; */
-/*      char *t2; */
-     
-/*      translated = malloc(size); */
-/*      if (translated == NULL) */
-/* 	goto error; */
-
-/*      t1 = string; */
-/*      t2 = translated; */
-/*      while(t1 < textend) */
-/* 	*t2++ = bufp->translate[*t1++]; */
-     
-/*      text = translated + pos; */
-/*      textstart = translated; */
-/*      textend = translated + size; */
-/*   } */
   
-  NEW_STATE(state);
+  NEW_STATE(state, bufp->num_registers);
   
   continue_matching:
   switch (*code++)
@@ -1587,7 +1624,7 @@ int re_match(regexp_t bufp,
 	   }
 	   else
 	   {
-	      for (a = 1; a < RE_NREGS; a++)
+	      for (a = 1; a < bufp->num_registers; a++)
 	      {
 		 if ((GET_REG_START(state, a) == NULL) ||
 		     (GET_REG_END(state, a) == NULL))
@@ -1599,10 +1636,13 @@ int re_match(regexp_t bufp,
 		 old_regs->start[a] = GET_REG_START(state, a) - textstart;
 		 old_regs->end[a] = GET_REG_END(state, a) - textstart;
 	      }
+	      for (; a < RE_NREGS; a++)
+	      {
+		 old_regs->start[a] = -1;
+		 old_regs->end[a] = -1;
+	      }
 	   }
 	}
-/* 	if(translated) */
-/* 	   free(translated); */
 	FREE_STATE(state);
 	return match_end - pos;
      }
@@ -1703,18 +1743,18 @@ int re_match(regexp_t bufp,
      {
 	a = (unsigned char)*code++;
 	a |= (unsigned char)*code++ << 8;
-	code += (int)(short)a;
+	code += (int)SHORT(a);
 	goto continue_matching;
      }
      case Cdummy_failure_jump:
      {
 	a = (unsigned char)*code++;
 	a |= (unsigned char)*code++ << 8;
-	a = (int)(short)a;
+	a = (int)SHORT(a);
 	assert(*code == Cfailure_jump);
 	b = (unsigned char)code[1];
 	b |= (unsigned char)code[2] << 8;
-	PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error);
+	PUSH_FAILURE(state, code + (int)SHORT(b) + 3, NULL, goto error);
 	code += a;
 	goto continue_matching;
      }
@@ -1722,10 +1762,120 @@ int re_match(regexp_t bufp,
      {
 	a = (unsigned char)*code++;
 	a |= (unsigned char)*code++ << 8;
-	a = (int)(short)a;
+	a = (int)SHORT(a);
 	PUSH_FAILURE(state, code + a, text, goto error);
 	goto continue_matching;
      }
+     case Crepeat1:
+     {
+	char *pinst;
+	a = (unsigned char)*code++;
+	a |= (unsigned char)*code++ << 8;
+	a = (int)SHORT(a);
+	pinst = code + a;
+	/* pinst is sole instruction in loop, and it matches a
+	 * single character.  Since Crepeat1 was originally a
+	 * Cupdate_failure_jump, we also know that backtracking is
+	 * useless:  so long as the single-character expression matches,
+	 * it must be used.  Also, in the case of +, we've already
+	 * matched one character, so + can't fail:  nothing here can
+	 * cause a failure.
+	 */
+	switch (*pinst++)
+	{
+	   case Cset:
+	   {
+              if (translate)
+	      {
+		 while (text < textend)
+		 {
+		    ch = translate[(unsigned char)*text];
+		    if (pinst[ch/8] & (1<<(ch & 7)))
+		       text++;
+		    else
+		       break;
+		 }
+              }
+              else
+              {
+		 while (text < textend)
+		 {
+		    ch = (unsigned char)*text;
+		    if (pinst[ch/8] & (1<<(ch & 7)))
+		       text++;
+		    else
+		       break;
+		 }
+              }
+	      break;
+	   }
+	   case Cexact:
+	   {
+	      ch = (unsigned char)*pinst;
+              if (translate)
+	      {
+		 while (text < textend &&
+			translate[(unsigned char)*text] == ch)
+		    text++;
+              }
+              else
+              {
+		 while (text < textend && (unsigned char)*text == ch)
+		    text++;
+              }
+	      break;
+	   }
+	   case Canychar:
+	   {
+	      while (text < textend && (unsigned char)*text != '\n')
+		 text++;
+	      break;
+	   }
+	   case Csyntaxspec:
+	   {
+	      a = (unsigned char)*pinst;
+              if (translate)
+	      {
+		 while (text < textend &&
+			translate[SYNTAX(*text)] == a)
+		    text++;
+              }
+              else
+              {
+                while (text < textend && SYNTAX(*text) == a)
+                   text++;
+              }
+	      break;
+	   }
+	   case Cnotsyntaxspec:
+	   {
+	      a = (unsigned char)*pinst;
+              if (translate)
+	      {
+		 while (text < textend &&
+			translate[SYNTAX(*text)] != a)
+		    text++;
+              }
+              else
+              {
+		 while (text < textend && SYNTAX(*text) != a)
+		    text++;
+              }
+	      break;
+	   }
+	   default:
+	   {
+	      abort();
+	      /*NOTREACHED*/
+	   }
+	}
+	/* due to the funky way + and * are compiled, the top failure-
+	 * stack entry at this point is actually a success entry --
+	 * update it & pop it
+	 */
+	UPDATE_FAILURE(state, text, goto error);
+	goto fail;      /* i.e., succeed <wink/sigh> */
+     }
      case Cbegbuf:
      {
 	if (text == textstart)
diff --git a/Modules/regexpr.h b/Modules/regexpr.h
index e623362..1221802 100644
--- a/Modules/regexpr.h
+++ b/Modules/regexpr.h
@@ -1,3 +1,7 @@
+/*
+ * -*- mode: c-mode; c-file-style: python -*-
+ */
+
 #ifndef Py_REGEXPR_H
 #define Py_REGEXPR_H
 #ifdef __cplusplus
@@ -5,22 +9,22 @@ extern "C" {
 #endif
 
 /*
-
-regexpr.h
-
-Author: Tatu Ylonen <ylo@ngs.fi>
-
-Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
-
-Permission to use, copy, modify, distribute, and sell this software
-and its documentation for any purpose is hereby granted without fee,
-provided that the above copyright notice appear in all copies.  This
-software is provided "as is" without express or implied warranty.
-
-Created: Thu Sep 26 17:15:36 1991 ylo
-Last modified: Mon Nov  4 15:49:46 1991 ylo
-
-*/
+ * regexpr.h
+ *
+ * Author: Tatu Ylonen <ylo@ngs.fi>
+ *
+ * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without fee,
+ * provided that the above copyright notice appear in all copies.  This
+ * software is provided "as is" without express or implied warranty.
+ *
+ * Created: Thu Sep 26 17:15:36 1991 ylo
+ * Last modified: Mon Nov  4 15:49:46 1991 ylo
+ */
+
+/* $Id$ */
 
 #ifndef REGEXPR_H
 #define REGEXPR_H
@@ -29,21 +33,22 @@ Last modified: Mon Nov  4 15:49:46 1991 ylo
 
 typedef struct re_pattern_buffer
 {
-  char *buffer; 	 /* compiled pattern */
-  int allocated;	 /* allocated size of compiled pattern */
-  int used;		 /* actual length of compiled pattern */
-  char *fastmap;	 /* fastmap[ch] is true if ch can start pattern */
-  char *translate;	 /* translation to apply during compilation/matching */
-  char fastmap_accurate; /* true if fastmap is valid */
-  char can_be_null;	 /* true if can match empty string */
-  char uses_registers;	 /* registers are used and need to be initialized */
-  char anchor;		 /* anchor: 0=none 1=begline 2=begbuf */
+	char *buffer;          /* compiled pattern */
+	int allocated;         /* allocated size of compiled pattern */
+	int used;              /* actual length of compiled pattern */
+	char *fastmap;         /* fastmap[ch] is true if ch can start pattern */
+	char *translate;       /* translation to apply during compilation/matching */
+	char fastmap_accurate; /* true if fastmap is valid */
+	char can_be_null;      /* true if can match empty string */
+	char uses_registers;   /* registers are used and need to be initialized */
+	int num_registers;     /* number of registers used */
+	char anchor;           /* anchor: 0=none 1=begline 2=begbuf */
 } *regexp_t;
 
 typedef struct re_registers
 {
-  int start[RE_NREGS];  /* start offset of region */
-  int end[RE_NREGS];    /* end offset of region */
+	int start[RE_NREGS];  /* start offset of region */
+	int end[RE_NREGS];    /* end offset of region */
 } *regexp_registers_t;
 
 /* bit definitions for syntax */
@@ -77,52 +82,53 @@ typedef struct re_registers
 #ifdef HAVE_PROTOTYPES
 
 extern int re_syntax;
-/* This is the actual syntax mask.  It was added so that Python
-   could do syntax-dependent munging of patterns before compilation. */
+/* This is the actual syntax mask.  It was added so that Python could do
+ * syntax-dependent munging of patterns before compilation. */
 
 int re_set_syntax(int syntax);
 /* This sets the syntax to use and returns the previous syntax.  The
-   syntax is specified by a bit mask of the above defined bits. */
+ * syntax is specified by a bit mask of the above defined bits. */
 
 char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
 /* This compiles the regexp (given in regex and length in regex_size).
-   This returns NULL if the regexp compiled successfully, and an error
-   message if an error was encountered.  The buffer field must be
-   initialized to a memory area allocated by malloc (or to NULL) before
-   use, and the allocated field must be set to its length (or 0 if buffer is
-   NULL).  Also, the translate field must be set to point to a valid
-   translation table, or NULL if it is not used. */
+ * This returns NULL if the regexp compiled successfully, and an error
+ * message if an error was encountered.  The buffer field must be
+ * initialized to a memory area allocated by malloc (or to NULL) before
+ * use, and the allocated field must be set to its length (or 0 if
+ * buffer is NULL).  Also, the translate field must be set to point to a
+ * valid translation table, or NULL if it is not used. */
 
 int re_match(regexp_t compiled, char *string, int size, int pos,
 	     regexp_registers_t old_regs);
 /* This tries to match the regexp against the string.  This returns the
-   length of the matched portion, or -1 if the pattern could not be
-   matched and -2 if an error (such as failure stack overflow) is
-   encountered. */
+ * length of the matched portion, or -1 if the pattern could not be
+ * matched and -2 if an error (such as failure stack overflow) is
+ * encountered. */
 
 int re_search(regexp_t compiled, char *string, int size, int startpos,
 	      int range, regexp_registers_t regs);
-/* This rearches for a substring matching the regexp.  This returns the first
-   index at which a match is found.  range specifies at how many positions to
-   try matching; positive values indicate searching forwards, and negative
-   values indicate searching backwards.  mstop specifies the offset beyond
-   which a match must not go.  This returns -1 if no match is found, and
-   -2 if an error (such as failure stack overflow) is encountered. */
+/* This rearches for a substring matching the regexp.  This returns the
+ * first index at which a match is found.  range specifies at how many
+ * positions to try matching; positive values indicate searching
+ * forwards, and negative values indicate searching backwards.  mstop
+ * specifies the offset beyond which a match must not go.  This returns
+ * -1 if no match is found, and -2 if an error (such as failure stack
+ * overflow) is encountered. */
 
 void re_compile_fastmap(regexp_t compiled);
 /* This computes the fastmap for the regexp.  For this to have any effect,
-   the calling program must have initialized the fastmap field to point
-   to an array of 256 characters. */
+ * the calling program must have initialized the fastmap field to point
+ * to an array of 256 characters. */
 
 char *re_comp(char *s);
 /* BSD 4.2 regex library routine re_comp.  This compiles the regexp into
-   an internal buffer.  This returns NULL if the regexp was compiled
-   successfully, and an error message if there was an error. */
+ * an internal buffer.  This returns NULL if the regexp was compiled
+ * successfully, and an error message if there was an error. */
 
 int re_exec(char *s);
-/* BSD 4.2 regexp library routine re_exec.  This returns true if the string
-   matches the regular expression (that is, a matching part is found
-   anywhere in the string). */
+/* BSD 4.2 regexp library routine re_exec.  This returns true if the
+ * string matches the regular expression (that is, a matching part is
+ * found anywhere in the string). */
 
 #else /* HAVE_PROTOTYPES */
 
diff --git a/Modules/reopmodule.c b/Modules/reopmodule.c
new file mode 100644
index 0000000..0d12210
--- /dev/null
+++ b/Modules/reopmodule.c
@@ -0,0 +1,375 @@
+/***********************************************************
+Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
+The Netherlands.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the names of Stichting Mathematisch
+Centrum or CWI or Corporation for National Research Initiatives or
+CNRI not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+While CWI is the initial source for this software, a modified version
+is made available by the Corporation for National Research Initiatives
+(CNRI) at the Internet address ftp://ftp.python.org.
+
+STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
+CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
+DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+
+******************************************************************/
+
+/* $Id$ */
+
+/* Regular expression objects */
+/* This uses Tatu Ylonen's copyleft-free reimplementation of
+   GNU regular expressions */
+
+#include "Python.h"
+
+#include <ctype.h>
+
+#include "regexpr.h"
+
+static PyObject *ReopError;	/* Exception */	
+
+static PyObject *
+makeresult(regs, num_regs)
+	struct re_registers *regs;
+	int num_regs;
+{
+	PyObject *v;
+	int i;
+	static PyObject *filler = NULL;
+
+	if (filler == NULL) {
+		filler = Py_BuildValue("(ii)", -1, -1);
+		if (filler == NULL)
+			return NULL;
+	}
+	v = PyTuple_New(num_regs);
+	if (v == NULL)
+		return NULL;
+
+	for (i = 0; i < num_regs; i++) {
+		int lo = regs->start[i];
+		int hi = regs->end[i];
+		PyObject *w;
+		if (lo == -1 && hi == -1) {
+			w = filler;
+			Py_INCREF(w);
+		}
+		else
+			w = Py_BuildValue("(ii)", lo, hi);
+		if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
+			Py_DECREF(v);
+			return NULL;
+		}
+	}
+	return v;
+}
+
+static PyObject *
+reop_match(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+	char *string;
+	int fastmaplen, stringlen;
+	int can_be_null, anchor, i;
+	int num_regs, flags, pos, result;
+	struct re_pattern_buffer bufp;
+	struct re_registers re_regs;
+	
+	if (!PyArg_Parse(args, "(s#iiis#is#i)", 
+			 &(bufp.buffer), &(bufp.allocated), 
+			 &num_regs, &flags, &can_be_null,
+			 &(bufp.fastmap), &fastmaplen,
+			 &anchor,
+			 &string, &stringlen, 
+			 &pos))
+	  return NULL;
+
+	/* XXX sanity-check the input data */
+	bufp.used=bufp.allocated;
+	bufp.translate=NULL;
+	bufp.fastmap_accurate=1;
+	bufp.can_be_null=can_be_null;
+	bufp.uses_registers=1;
+	bufp.num_registers=num_regs;
+	bufp.anchor=anchor;
+	
+	for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;}
+	
+	result = re_match(&bufp, 
+			  string, stringlen, pos, 
+			  &re_regs);
+	if (result < -1) {
+		/* Failure like stack overflow */
+		PyErr_SetString(ReopError, "match failure");
+		return NULL;
+	}
+	return makeresult(&re_regs, num_regs);
+}
+
+static PyObject *
+reop_search(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+	char *string;
+	int fastmaplen, stringlen;
+	int can_be_null, anchor, i;
+	int num_regs, flags, pos, result;
+	struct re_pattern_buffer bufp;
+	struct re_registers re_regs;
+	
+	if (!PyArg_Parse(args, "(s#iiis#is#i)", 
+			 &(bufp.buffer), &(bufp.allocated), 
+			 &num_regs, &flags, &can_be_null,
+			 &(bufp.fastmap), &fastmaplen,
+			 &anchor,
+			 &string, &stringlen, 
+			 &pos))
+	  return NULL;
+
+	/* XXX sanity-check the input data */
+	bufp.used=bufp.allocated;
+	bufp.translate=NULL;
+	bufp.fastmap_accurate=1;
+	bufp.can_be_null=can_be_null;
+	bufp.uses_registers=1;
+	bufp.num_registers=1;
+	bufp.anchor=anchor;
+
+	for(i=0; i<num_regs; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;}
+	
+	result = re_search(&bufp, 
+			   string, stringlen, pos, stringlen-pos,
+			   &re_regs);
+	if (result < -1) {
+		/* Failure like stack overflow */
+		PyErr_SetString(ReopError, "match failure");
+		return NULL;
+	}
+	return makeresult(&re_regs, num_regs);
+}
+
+#if 0
+/* Functions originally in the regsub module.
+   Added June 1, 1997. 
+   */
+
+/* A cache of previously used patterns is maintained.  Notice that if
+   you change the reop syntax flag, entries in the cache are
+   invalidated.  
+   XXX Solution: use (syntax flag, pattern) as keys?  Clear the cache
+   every so often, or once it gets past a certain size? 
+*/
+
+static PyObject *cache_dict=NULL;
+
+/* Accept an object; if it's a reop pattern, Py_INCREF it and return
+   it.  If it's a string, a reop object is compiled and cached.
+*/
+   
+static reopobject *
+cached_compile(pattern)
+     PyObject *pattern;
+{
+  reopobject *p2;
+
+  if (!PyString_Check(pattern)) 
+    {
+      /* It's not a string, so assume it's a compiled reop object */
+      /* XXX check that! */
+      Py_INCREF(pattern);
+      return (reopobject*)pattern;
+    }
+  if (cache_dict==NULL)
+    {
+      cache_dict=PyDict_New();
+      if (cache_dict==NULL) 
+	{
+	  return (reopobject*)NULL;
+	}
+    }
+
+  /* See if the pattern has already been cached; if so, return that
+     reop object */
+  p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
+  if (p2)
+    {
+      Py_INCREF(p2);
+      return (reopobject*)p2;
+    }
+
+  /* Compile the pattern and cache it */
+  p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
+  if (!p2) return p2;
+  PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
+  return p2;
+}
+
+
+static PyObject *
+internal_split(args, retain)
+	PyObject *args;
+	int retain;
+{
+  PyObject *newlist, *s;
+  reopobject *pattern;
+  int maxsplit=0, count=0, length, next=0, result;
+  int match_end=0; /* match_start is defined below */
+  char *start;
+
+  if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
+			&maxsplit))
+    {
+      PyErr_Clear();
+      if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
+	return NULL;
+    }
+  pattern=cached_compile((PyObject *)pattern);
+  if (!pattern) return NULL;
+
+  newlist=PyList_New(0);
+  if (!newlist) return NULL;
+  
+  do
+    {
+      result = re_search(&pattern->re_patbuf, 
+			     start, length, next, length-next,
+			     &pattern->re_regs);
+      if (result < -1)
+	{  /* Erk... an error happened during the reop search */
+	  Py_DECREF(newlist);
+	  PyErr_SetString(ReopError, "match failure");
+	  return NULL;
+	}
+      if (next<=result) 
+	{
+	  int match_start=pattern->re_regs.start[0];
+	  int oldmatch_end=match_end;
+	  match_end=pattern->re_regs.end[0];
+
+	  if (match_start==match_end) 
+	    { /* A zero-length match; increment to the next position */
+	      next=result+1;
+	      match_end=oldmatch_end;
+	      continue;
+	    }
+
+	  /* Append the string up to the start of the match */
+	  s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
+	  if (!s) 
+	    {
+	      Py_DECREF(newlist);
+	      return NULL;
+	    }
+	  PyList_Append(newlist, s);
+	  Py_DECREF(s);
+
+	  if (retain)
+	    {
+	      /* Append a string containing whatever matched */
+	      s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
+	      if (!s) 
+		{
+		  Py_DECREF(newlist);
+		  return NULL;
+		}
+	      PyList_Append(newlist, s);
+	      Py_DECREF(s);
+	    }
+	  /* Update the pointer, and increment the count of splits */
+	  next=match_end; count++;
+	}
+    } while (result!=-1 && !(maxsplit && maxsplit==count) &&
+	     next<length);
+  s=PyString_FromStringAndSize(start+match_end, length-match_end);
+  if (!s) 
+    {
+      Py_DECREF(newlist);
+      return NULL;
+    }
+  PyList_Append(newlist, s);
+  Py_DECREF(s);
+  Py_DECREF(pattern);
+  return newlist;
+}
+
+static PyObject *
+reop_split(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+  return internal_split(args, 0);
+}
+
+static PyObject *
+reop_splitx(self, args)
+	PyObject *self;
+	PyObject *args;
+{
+  return internal_split(args, 1);
+}
+#endif
+
+static struct PyMethodDef reop_global_methods[] = {
+	{"match",	reop_match, 0},
+	{"search",	reop_search, 0},
+#if 0
+	{"split",  reop_split, 0},
+	{"splitx",  reop_splitx, 0},
+#endif
+	{NULL,		NULL}		     /* sentinel */
+};
+
+void
+initreop()
+{
+	PyObject *m, *d, *v;
+	int i;
+	char *s;
+	
+	m = Py_InitModule("reop", reop_global_methods);
+	d = PyModule_GetDict(m);
+	
+	/* Initialize reop.error exception */
+	v = ReopError = PyString_FromString("reop.error");
+	if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
+		goto finally;
+	
+	/* Initialize reop.casefold constant */
+	if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
+		goto finally;
+	
+	if (!(s = PyString_AsString(v)))
+		goto finally;
+
+	for (i = 0; i < 256; i++) {
+		if (isupper(i))
+			s[i] = tolower(i);
+		else
+			s[i] = i;
+	}
+	if (PyDict_SetItemString(d, "casefold", v) < 0)
+		goto finally;
+	Py_DECREF(v);
+
+	if (!PyErr_Occurred())
+		return;
+  finally:
+	Py_FatalError("can't initialize reop module");
+}
author	Guido van Rossum <guido@python.org>	1997-07-10 14:31:32 (GMT)
committer	Guido van Rossum <guido@python.org>	1997-07-10 14:31:32 (GMT)
commit	db25f32849730915291993ece2023c1645ff73d2 (patch)
tree	5e9b3dccbc4c005f92f6a4b4820917e7c9a12629 /Modules
parent	db9e20f41802c104d688002af22a090b60a656b4 (diff)
download	cpython-db25f32849730915291993ece2023c1645ff73d2.zip cpython-db25f32849730915291993ece2023c1645ff73d2.tar.gz cpython-db25f32849730915291993ece2023c1645ff73d2.tar.bz2