Initial revision

author: Guido van Rossum <guido@python.org> 1992-01-19 16:32:47 (GMT)
committer: Guido van Rossum <guido@python.org> 1992-01-19 16:32:47 (GMT)
commit: b674c3bba50b1f5ea212dd85366f871872ad0be4 (patch)
tree: 3ef14c83c8ef59d1d1943230fa75b9d6d323fd6c /Modules
parent: 1cab95c14dd56988030a6899ac8934aeab63affc (diff)
download: cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.zip
cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.tar.gz
cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.tar.bz2
2 files changed, 1803 insertions, 0 deletions
diff --git a/Modules/regexpr.c b/Modules/regexpr.c
new file mode 100644
index 0000000..903a6ab
--- /dev/null
+++ b/Modules/regexpr.c
@@ -0,0 +1,1668 @@
+/*
+
+regexpr.c
+
+Author: Tatu Ylonen <ylo@ngs.fi>
+
+Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies.  This
+software is provided "as is" without express or implied warranty.
+
+Created: Thu Sep 26 17:14:05 1991 ylo
+Last modified: Mon Nov  4 17:06:48 1991 ylo
+
+This code draws many ideas from the regular expression packages by
+Henry Spencer of the University of Toronto and Richard Stallman of the
+Free Software Foundation.
+
+Emacs-specific code and syntax table code is almost directly borrowed
+from GNU regexp.
+
+$Header$
+
+*/
+
+#include <stdio.h>
+#include <assert.h>
+#include "regexpr.h"
+
+char *malloc();
+void free();
+char *realloc();
+
+#define MACRO_BEGIN do {
+#define MACRO_END } while (0)
+
+enum regexp_compiled_ops /* opcodes for compiled regexp */
+{
+  Cend,			/* end of pattern reached */
+  Cbol,			/* beginning of line */
+  Ceol,			/* end of line */
+  Cset,			/* character set.  Followed by 32 bytes of set. */
+  Cexact,		/* followed by a byte to match */
+  Canychar,		/* matches any character except newline */
+  Cstart_memory,	/* set register start addr (followed by reg number) */
+  Cend_memory,		/* set register end addr (followed by reg number) */
+  Cmatch_memory,	/* match a duplicate of reg contents (regnum follows)*/
+  Cjump,		/* followed by two bytes (lsb,msb) of displacement. */
+  Cstar_jump,		/* will change to jump/update_failure_jump at runtime */
+  Cfailure_jump,	/* jump to addr on failure */
+  Cupdate_failure_jump,	/* update topmost failure point and jump */
+  Cdummy_failure_jump,	/* push a dummy failure point and jump */
+  Cbegbuf,		/* match at beginning of buffer */
+  Cendbuf,		/* match at end of buffer */
+  Cwordbeg,		/* match at beginning of word */
+  Cwordend,		/* match at end of word */
+  Cwordbound,		/* match if at word boundary */
+  Cnotwordbound,	/* match if not at word boundary */
+#ifdef emacs
+  Cemacs_at_dot,	/* emacs only: matches at dot */
+#endif /* emacs */
+  Csyntaxspec,		/* matches syntax code (1 byte follows) */
+  Cnotsyntaxspec	/* matches if syntax code does not match (1 byte foll)*/
+};
+
+enum regexp_syntax_op	/* syntax codes for plain and quoted characters */
+{
+  Rend,			/* special code for end of regexp */
+  Rnormal,		/* normal character */
+  Ranychar,		/* any character except newline */
+  Rquote,		/* the quote character */
+  Rbol,			/* match beginning of line */
+  Reol,			/* match end of line */
+  Roptional,		/* match preceding expression optionally */
+  Rstar,		/* match preceding expr zero or more times */
+  Rplus,		/* match preceding expr one or more times */
+  Ror,			/* match either of alternatives */
+  Ropenpar,		/* opening parenthesis */
+  Rclosepar,		/* closing parenthesis */
+  Rmemory,		/* match memory register */
+  Rextended_memory,	/* \vnn to match registers 10-99 */
+  Ropenset,		/* open set.  Internal syntax hard-coded below. */
+  /* the following are gnu extensions to "normal" regexp syntax */
+  Rbegbuf,		/* beginning of buffer */
+  Rendbuf,		/* end of buffer */
+  Rwordchar,		/* word character */
+  Rnotwordchar,		/* not word character */
+  Rwordbeg,		/* beginning of word */
+  Rwordend,		/* end of word */
+  Rwordbound,		/* word bound */
+  Rnotwordbound,	/* not word bound */
+#ifdef emacs
+  Remacs_at_dot,	/* emacs: at dot */
+  Remacs_syntaxspec,	/* syntaxspec */
+  Remacs_notsyntaxspec,	/* notsyntaxspec */
+#endif /* emacs */
+  Rnum_ops
+};
+
+static int re_compile_initialized = 0;
+static int regexp_syntax = 0;
+static unsigned char regexp_plain_ops[256];
+static unsigned char regexp_quoted_ops[256];
+static unsigned char regexp_precedences[Rnum_ops];
+static int regexp_context_indep_ops;
+static int regexp_ansi_sequences;
+
+#define NUM_LEVELS  5    /* number of precedence levels in use */
+#define MAX_NESTING 100  /* max nesting level of operators */
+
+#ifdef emacs
+
+/* This code is for emacs compatibility only. */
+
+#include "config.h"
+#include "lisp.h"
+#include "buffer.h"
+#include "syntax.h"
+
+/* emacs defines NULL in some strange way? */
+#undef NULL
+#define NULL 0
+
+#else /* emacs */
+
+#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
+#define Sword 1
+
+#ifdef SYNTAX_TABLE
+char *re_syntax_table;
+#else
+static char re_syntax_table[256];
+#endif /* SYNTAX_TABLE */
+
+#endif /* emacs */
+
+static void re_compile_initialize()
+{
+  int a;
+  
+#if !defined(emacs) && !defined(SYNTAX_TABLE)
+  static int syntax_table_inited = 0;
+  
+  if (!syntax_table_inited)
+    {
+      syntax_table_inited = 1;
+      memset(re_syntax_table, 0, 256);
+      for (a = 'a'; a <= 'z'; a++)
+	re_syntax_table[a] = Sword;
+      for (a = 'A'; a <= 'Z'; a++)
+	re_syntax_table[a] = Sword;
+      for (a = '0'; a <= '9'; a++)
+	re_syntax_table[a] = Sword;
+    }
+#endif /* !emacs && !SYNTAX_TABLE */
+  re_compile_initialized = 1;
+  for (a = 0; a < 256; a++)
+    {
+      regexp_plain_ops[a] = Rnormal;
+      regexp_quoted_ops[a] = Rnormal;
+    }
+  for (a = '0'; a <= '9'; a++)
+    regexp_quoted_ops[a] = Rmemory;
+  regexp_plain_ops['\134'] = Rquote;
+  if (regexp_syntax & RE_NO_BK_PARENS)
+    {
+      regexp_plain_ops['('] = Ropenpar;
+      regexp_plain_ops[')'] = Rclosepar;
+    }
+  else
+    {
+      regexp_quoted_ops['('] = Ropenpar;
+      regexp_quoted_ops[')'] = Rclosepar;
+    }
+  if (regexp_syntax & RE_NO_BK_VBAR)
+    regexp_plain_ops['\174'] = Ror;
+  else
+    regexp_quoted_ops['\174'] = Ror;
+  regexp_plain_ops['*'] = Rstar;
+  if (regexp_syntax & RE_BK_PLUS_QM)
+    {
+      regexp_quoted_ops['+'] = Rplus;
+      regexp_quoted_ops['?'] = Roptional;
+    }
+  else
+    {
+      regexp_plain_ops['+'] = Rplus;
+      regexp_plain_ops['?'] = Roptional;
+    }
+  if (regexp_syntax & RE_NEWLINE_OR)
+    regexp_plain_ops['\n'] = Ror;
+  regexp_plain_ops['\133'] = Ropenset;
+  regexp_plain_ops['\136'] = Rbol;
+  regexp_plain_ops['$'] = Reol;
+  regexp_plain_ops['.'] = Ranychar;
+  if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
+    {
+#ifdef emacs
+      regexp_quoted_ops['='] = Remacs_at_dot;
+      regexp_quoted_ops['s'] = Remacs_syntaxspec;
+      regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
+#endif /* emacs */
+      regexp_quoted_ops['w'] = Rwordchar;
+      regexp_quoted_ops['W'] = Rnotwordchar;
+      regexp_quoted_ops['<'] = Rwordbeg;
+      regexp_quoted_ops['>'] = Rwordend;
+      regexp_quoted_ops['b'] = Rwordbound;
+      regexp_quoted_ops['B'] = Rnotwordbound;
+      regexp_quoted_ops['`'] = Rbegbuf;
+      regexp_quoted_ops['\''] = Rendbuf;
+    }
+  if (regexp_syntax & RE_ANSI_HEX)
+    regexp_quoted_ops['v'] = Rextended_memory;
+  for (a = 0; a < Rnum_ops; a++)
+    regexp_precedences[a] = 4;
+  if (regexp_syntax & RE_TIGHT_VBAR)
+    {
+      regexp_precedences[Ror] = 3;
+      regexp_precedences[Rbol] = 2;
+      regexp_precedences[Reol] = 2;
+    }
+  else
+    {
+      regexp_precedences[Ror] = 2;
+      regexp_precedences[Rbol] = 3;
+      regexp_precedences[Reol] = 3;
+    }
+  regexp_precedences[Rclosepar] = 1;
+  regexp_precedences[Rend] = 0;
+  regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0;
+  regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
+}
+
+int re_set_syntax(syntax)
+int syntax;
+{
+  int ret;
+
+  ret = regexp_syntax;
+  regexp_syntax = syntax;
+  re_compile_initialize();
+  return ret;
+}
+
+static int hex_char_to_decimal(ch)
+int ch;
+{
+  if (ch >= '0' && ch <= '9')
+    return ch - '0';
+  if (ch >= 'a' && ch <= 'f')
+    return ch - 'a' + 10;
+  if (ch >= 'A' && ch <= 'F')
+    return ch - 'A' + 10;
+  return 16;
+}
+
+char *re_compile_pattern(regex, size, bufp)
+char *regex;
+int size;
+regexp_t bufp;
+{
+  int a, pos, op, current_level, level, opcode;
+  int pattern_offset, alloc;
+  int starts[NUM_LEVELS * MAX_NESTING], starts_base;
+  int future_jumps[MAX_NESTING], num_jumps;
+  unsigned char ch;
+  char *pattern, *translate;
+  int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
+  int beginning_context;
+
+#define NEXTCHAR(var)			\
+  MACRO_BEGIN				\
+    if (pos >= size)			\
+      goto ends_prematurely;		\
+    (var) = regex[pos];			\
+    pos++;				\
+  MACRO_END
+
+#define ALLOC(amount)				\
+  MACRO_BEGIN					\
+    if (pattern_offset+(amount) > alloc)	\
+      {						\
+	alloc += 256 + (amount);		\
+	pattern = realloc(pattern, alloc);	\
+	if (!pattern)				\
+	  goto out_of_memory;			\
+      }						\
+  MACRO_END
+
+#define STORE(ch) pattern[pattern_offset++] = (ch)
+
+#define CURRENT_LEVEL_START (starts[starts_base + current_level])
+
+#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
+
+#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
+		            starts_base += NUM_LEVELS;			\
+                          else						\
+			    goto too_complex
+
+#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
+
+#define PUT_ADDR(offset,addr)				\
+  MACRO_BEGIN						\
+    int disp = (addr) - (offset) - 2;			\
+    pattern[(offset)] = disp & 0xff;			\
+    pattern[(offset)+1] = (disp>>8) & 0xff;		\
+  MACRO_END
+
+#define INSERT_JUMP(pos,type,addr)			\
+  MACRO_BEGIN						\
+    int a, p = (pos), t = (type), ad = (addr);		\
+    for (a = pattern_offset - 1; a >= p; a--)		\
+      pattern[a + 3] = pattern[a];			\
+    pattern[p] = t;					\
+    PUT_ADDR(p+1,ad);					\
+    pattern_offset += 3;				\
+  MACRO_END
+
+#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
+
+#define SET_FIELDS				\
+  MACRO_BEGIN					\
+    bufp->allocated = alloc;			\
+    bufp->buffer = pattern;			\
+    bufp->used = pattern_offset;		\
+  MACRO_END
+    
+#define GETHEX(var)						\
+  MACRO_BEGIN							\
+    char gethex_ch, gethex_value;				\
+    NEXTCHAR(gethex_ch);					\
+    gethex_value = hex_char_to_decimal(gethex_ch);		\
+    if (gethex_value == 16)					\
+      goto hex_error;						\
+    NEXTCHAR(gethex_ch);					\
+    gethex_ch = hex_char_to_decimal(gethex_ch);			\
+    if (gethex_ch == 16)					\
+      goto hex_error;						\
+    (var) = gethex_value * 16 + gethex_ch;			\
+  MACRO_END
+
+#define ANSI_TRANSLATE(ch)				\
+  MACRO_BEGIN						\
+    switch (ch)						\
+      {							\
+      case 'a':						\
+      case 'A':						\
+	ch = 7; /* audible bell */			\
+	break;						\
+      case 'b':						\
+      case 'B':						\
+	ch = 8; /* backspace */				\
+	break;						\
+      case 'f':						\
+      case 'F':						\
+	ch = 12; /* form feed */			\
+	break;						\
+      case 'n':						\
+      case 'N':						\
+	ch = 10; /* line feed */			\
+	break;						\
+      case 'r':						\
+      case 'R':						\
+	ch = 13; /* carriage return */			\
+	break;						\
+      case 't':						\
+      case 'T':						\
+	ch = 9; /* tab */				\
+	break;						\
+      case 'v':						\
+      case 'V':						\
+	ch = 11; /* vertical tab */			\
+	break;						\
+      case 'x': /* hex code */				\
+      case 'X':						\
+	GETHEX(ch);					\
+	break;						\
+      default:						\
+	/* other characters passed through */		\
+	if (translate)					\
+	  ch = translate[(unsigned char)ch];		\
+	break;						\
+      }							\
+  MACRO_END
+
+  if (!re_compile_initialized)
+    re_compile_initialize();
+  bufp->used = 0;
+  bufp->fastmap_accurate = 0;
+  bufp->uses_registers = 0;
+  translate = bufp->translate;
+  pattern = bufp->buffer;
+  alloc = bufp->allocated;
+  if (alloc == 0 || pattern == NULL)
+    {
+      alloc = 256;
+      pattern = malloc(alloc);
+      if (!pattern)
+	goto out_of_memory;
+    }
+  pattern_offset = 0;
+  starts_base = 0;
+  num_jumps = 0;
+  current_level = 0;
+  SET_LEVEL_START;
+  num_open_registers = 0;
+  next_register = 1;
+  paren_depth = 0;
+  beginning_context = 1;
+  op = -1;
+  /* we use Rend dummy to ensure that pending jumps are updated (due to
+     low priority of Rend) before exiting the loop. */
+  pos = 0;
+  while (op != Rend)
+    {
+      if (pos >= size)
+	op = Rend;
+      else
+	{
+	  NEXTCHAR(ch);
+	  if (translate)
+	    ch = translate[(unsigned char)ch];
+	  op = regexp_plain_ops[(unsigned char)ch];
+	  if (op == Rquote)
+	    {
+	      NEXTCHAR(ch);
+	      op = regexp_quoted_ops[(unsigned char)ch];
+	      if (op == Rnormal && regexp_ansi_sequences)
+		ANSI_TRANSLATE(ch);
+	    }
+	}
+      level = regexp_precedences[op];
+      /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
+	     ch, op, level, current_level, CURRENT_LEVEL_START); */
+      if (level > current_level)
+	{
+	  for (current_level++; current_level < level; current_level++)
+	    SET_LEVEL_START;
+	  SET_LEVEL_START;
+	}
+      else
+	if (level < current_level)
+	  {
+	    current_level = level;
+	    for (;num_jumps > 0 &&
+		 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
+		 num_jumps--)
+	      PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
+	  }
+      switch (op)
+	{
+	case Rend:
+	  break;
+	case Rnormal:
+	normal_char:
+	  opcode = Cexact;
+	store_opcode_and_arg: /* opcode & ch must be set */
+	  SET_LEVEL_START;
+	  ALLOC(2);
+	  STORE(opcode);
+	  STORE(ch);
+	  break;
+	case Ranychar:
+	  opcode = Canychar;
+	store_opcode:
+	  SET_LEVEL_START;
+	  ALLOC(1);
+	  STORE(opcode);
+	  break;
+	case Rquote:
+	  abort();
+	  /*NOTREACHED*/
+	case Rbol:
+	  if (!beginning_context)
+	    if (regexp_context_indep_ops)
+	      goto op_error;
+	    else
+	      goto normal_char;
+	  opcode = Cbol;
+	  goto store_opcode;
+	case Reol:
+	  if (!((pos >= size) ||
+		((regexp_syntax & RE_NO_BK_VBAR) ?
+		 (regex[pos] == '\174') :
+		 (pos+1 < size && regex[pos] == '\134' &&
+		  regex[pos+1] == '\174')) ||
+		((regexp_syntax & RE_NO_BK_PARENS)?
+		 (regex[pos] == ')'):
+		 (pos+1 < size && regex[pos] == '\134' &&
+		  regex[pos+1] == ')'))))
+	    if (regexp_context_indep_ops)
+	      goto op_error;
+	    else
+	      goto normal_char;
+	  opcode = Ceol;
+	  goto store_opcode;
+	  break;
+	case Roptional:
+	  if (beginning_context)
+	    if (regexp_context_indep_ops)
+	      goto op_error;
+	    else
+	      goto normal_char;
+	  if (CURRENT_LEVEL_START == pattern_offset)
+	    break; /* ignore empty patterns for ? */
+	  ALLOC(3);
+	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
+		      pattern_offset + 3);
+	  break;
+	case Rstar:
+	case Rplus:
+	  if (beginning_context)
+	    if (regexp_context_indep_ops)
+	      goto op_error;
+	    else
+	      goto normal_char;
+	  if (CURRENT_LEVEL_START == pattern_offset)
+	    break; /* ignore empty patterns for + and * */
+	  ALLOC(9);
+	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
+		      pattern_offset + 6);
+	  INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
+	  if (op == Rplus)  /* jump over initial failure_jump */
+	    INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
+			CURRENT_LEVEL_START + 6);
+	  break;
+	case Ror:
+	  ALLOC(6);
+	  INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
+		      pattern_offset + 6);
+	  if (num_jumps >= MAX_NESTING)
+	    goto too_complex;
+	  STORE(Cjump);
+	  future_jumps[num_jumps++] = pattern_offset;
+	  STORE(0);
+	  STORE(0);
+	  SET_LEVEL_START;
+	  break;
+	case Ropenpar:
+	  SET_LEVEL_START;
+	  if (next_register < RE_NREGS)
+	    {
+	      bufp->uses_registers = 1;
+	      ALLOC(2);
+	      STORE(Cstart_memory);
+	      STORE(next_register);
+	      open_registers[num_open_registers++] = next_register;
+	      next_register++;
+	    }
+	  paren_depth++;
+	  PUSH_LEVEL_STARTS;
+	  current_level = 0;
+	  SET_LEVEL_START;
+	  break;
+	case Rclosepar:
+	  if (paren_depth <= 0)
+	    goto parenthesis_error;
+	  POP_LEVEL_STARTS;
+	  current_level = regexp_precedences[Ropenpar];
+	  paren_depth--;
+	  if (paren_depth < num_open_registers)
+	    {
+	      bufp->uses_registers = 1;
+	      ALLOC(2);
+	      STORE(Cend_memory);
+	      num_open_registers--;
+	      STORE(open_registers[num_open_registers]);
+	    }
+	  break;
+	case Rmemory:
+	  if (ch == '0')
+	    goto bad_match_register;
+	  assert(ch >= '0' && ch <= '9');
+	  bufp->uses_registers = 1;
+	  opcode = Cmatch_memory;
+	  ch -= '0';
+	  goto store_opcode_and_arg;
+	case Rextended_memory:
+	  NEXTCHAR(ch);
+	  if (ch < '0' || ch > '9')
+	    goto bad_match_register;
+	  NEXTCHAR(a);
+	  if (a < '0' || a > '9')
+	    goto bad_match_register;
+	  ch = 10 * (a - '0') + ch - '0';
+	  if (ch <= 0 || ch >= RE_NREGS)
+	    goto bad_match_register;
+	  bufp->uses_registers = 1;
+	  opcode = Cmatch_memory;
+	  goto store_opcode_and_arg;
+	case Ropenset:
+	  {
+	    int complement,prev,offset,range,firstchar;
+	    
+	    SET_LEVEL_START;
+	    ALLOC(1+256/8);
+	    STORE(Cset);
+	    offset = pattern_offset;
+	    for (a = 0; a < 256/8; a++)
+	      STORE(0);
+	    NEXTCHAR(ch);
+	    if (translate)
+	      ch = translate[(unsigned char)ch];
+	    if (ch == '\136')
+	      {
+		complement = 1;
+		NEXTCHAR(ch);
+		if (translate)
+		  ch = translate[(unsigned char)ch];
+	      }
+	    else
+	      complement = 0;
+	    prev = -1;
+	    range = 0;
+	    firstchar = 1;
+	    while (ch != '\135' || firstchar)
+	      {
+		firstchar = 0;
+		if (regexp_ansi_sequences && ch == '\134')
+		  {
+		    NEXTCHAR(ch);
+		    ANSI_TRANSLATE(ch);
+		  }
+		if (range)
+		  {
+		    for (a = prev; a <= ch; a++)
+		      SETBIT(pattern, offset, a);
+		    prev = -1;
+		    range = 0;
+		  }
+		else
+		  if (prev != -1 && ch == '-')
+		    range = 1;
+		  else
+		    {
+		      SETBIT(pattern, offset, ch);
+		      prev = ch;
+		    }
+		NEXTCHAR(ch);
+		if (translate)
+		  ch = translate[(unsigned char)ch];
+	      }
+	    if (range)
+	      SETBIT(pattern, offset, '-');
+	    if (complement)
+	      {
+		for (a = 0; a < 256/8; a++)
+		  pattern[offset+a] ^= 0xff;
+	      }
+	    break;
+	  }
+	case Rbegbuf:
+	  opcode = Cbegbuf;
+	  goto store_opcode;
+	case Rendbuf:
+	  opcode = Cendbuf;
+	  goto store_opcode;
+	case Rwordchar:
+	  opcode = Csyntaxspec;
+	  ch = Sword;
+	  goto store_opcode_and_arg;
+	case Rnotwordchar:
+	  opcode = Cnotsyntaxspec;
+	  ch = Sword;
+	  goto store_opcode_and_arg;
+	case Rwordbeg:
+	  opcode = Cwordbeg;
+	  goto store_opcode;
+	case Rwordend:
+	  opcode = Cwordend;
+	  goto store_opcode;
+	case Rwordbound:
+	  opcode = Cwordbound;
+	  goto store_opcode;
+	case Rnotwordbound:
+	  opcode = Cnotwordbound;
+	  goto store_opcode;
+#ifdef emacs
+	case Remacs_at_dot:
+	  opcode = Cemacs_at_dot;
+	  goto store_opcode;
+	case Remacs_syntaxspec:
+	  NEXTCHAR(ch);
+	  if (translate)
+	    ch = translate[(unsigned char)ch];
+	  opcode = Csyntaxspec;
+	  ch = syntax_spec_code[(unsigned char)ch];
+	  goto store_opcode_and_arg;
+	case Remacs_notsyntaxspec:
+	  NEXTCHAR(ch);
+	  if (translate)
+	    ch = translate[(unsigned char)ch];
+	  opcode = Cnotsyntaxspec;
+	  ch = syntax_spec_code[(unsigned char)ch];
+	  goto store_opcode_and_arg;
+#endif /* emacs */
+	default:
+	  abort();
+	}
+      beginning_context = (op == Ropenpar || op == Ror);
+    }
+  if (starts_base != 0)
+    goto parenthesis_error;
+  assert(num_jumps == 0);
+  ALLOC(1);
+  STORE(Cend);
+  SET_FIELDS;
+  return NULL;
+
+ op_error:
+  SET_FIELDS;
+  return "Badly placed special character";
+
+ bad_match_register:
+  SET_FIELDS;
+  return "Bad match register number";
+
+ hex_error:
+  SET_FIELDS;
+  return "Bad hexadecimal number";
+
+ parenthesis_error:
+  SET_FIELDS;
+  return "Badly placed parenthesis";
+
+ out_of_memory:
+  SET_FIELDS;
+  return "Out of memory";
+
+ ends_prematurely:
+  SET_FIELDS;
+  return "Regular expression ends prematurely";
+
+ too_complex:
+  SET_FIELDS;
+  return "Regular expression too complex";
+}
+#undef CHARAT
+#undef NEXTCHAR
+#undef GETHEX
+#undef ALLOC
+#undef STORE
+#undef CURRENT_LEVEL_START
+#undef SET_LEVEL_START
+#undef PUSH_LEVEL_STARTS
+#undef POP_LEVEL_STARTS
+#undef PUT_ADDR
+#undef INSERT_JUMP
+#undef SETBIT
+#undef SET_FIELDS
+
+static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
+char *code, *visited, *can_be_null, *fastmap;
+int pos;
+{
+  int a, b, syntaxcode;
+
+  if (visited[pos])
+    return;  /* we have already been here */
+  visited[pos] = 1;
+  for (;;)
+    switch (code[pos++])
+      {
+      case Cend:
+	*can_be_null = 1;
+	return;
+      case Cbol:
+      case Cbegbuf:
+      case Cendbuf:
+      case Cwordbeg:
+      case Cwordend:
+      case Cwordbound:
+      case Cnotwordbound:
+#ifdef emacs
+      case Cemacs_at_dot:
+#endif /* emacs */
+	break;
+      case Csyntaxspec:
+	syntaxcode = code[pos++];
+	for (a = 0; a < 256; a++)
+	  if (SYNTAX(a) == syntaxcode)
+	    fastmap[a] = 1;
+	return;
+      case Cnotsyntaxspec:
+	syntaxcode = code[pos++];
+	for (a = 0; a < 256; a++)
+	  if (SYNTAX(a) != syntaxcode)
+	    fastmap[a] = 1;
+	return;
+      case Ceol:
+	fastmap['\n'] = 1;
+	if (*can_be_null == 0)
+	  *can_be_null = 2;  /* can match null, but only at end of buffer*/
+	return;
+      case Cset:
+	for (a = 0; a < 256/8; a++)
+	  if (code[pos + a] != 0)
+	    for (b = 0; b < 8; b++)
+	      if (code[pos + a] & (1 << b))
+		fastmap[(a << 3) + b] = 1;
+	pos += 256/8;
+	return;
+      case Cexact:
+	fastmap[(unsigned char)code[pos]] = 1;
+	return;
+      case Canychar:
+	for (a = 0; a < 256; a++)
+	  if (a != '\n')
+	    fastmap[a] = 1;
+	return;
+      case Cstart_memory:
+      case Cend_memory:
+	pos++;
+	break;
+      case Cmatch_memory:
+	/* should this ever happen for sensible patterns??? */
+	*can_be_null = 1;
+	return;
+      case Cjump:
+      case Cdummy_failure_jump:
+      case Cupdate_failure_jump:
+      case Cstar_jump:
+	a = (unsigned char)code[pos++];
+	a |= (unsigned char)code[pos++] << 8;
+	pos += (int)(short)a;
+	if (visited[pos])
+	  {
+	    /* argh... the regexp contains empty loops.  This is not
+	       good, as this may cause a failure stack overflow when
+	       matching.  Oh well. */
+	    /* this path leads nowhere; pursue other paths. */
+	    return;
+	  }
+	visited[pos] = 1;
+	break;
+      case Cfailure_jump:
+	a = (unsigned char)code[pos++];
+	a |= (unsigned char)code[pos++] << 8;
+	a = pos + (int)(short)a;
+	re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
+	break;
+      default:
+	abort();  /* probably some opcode is missing from this switch */
+	/*NOTREACHED*/
+      }
+}
+
+static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
+char *buffer, *fastmap, *can_be_null;
+int used, pos;
+{
+  char small_visited[512], *visited;
+
+  if (used <= sizeof(small_visited))
+    visited = small_visited;
+  else
+    {
+      visited = malloc(used);
+      if (!visited)
+	return 0;
+    }
+  *can_be_null = 0;
+  memset(fastmap, 0, 256);
+  memset(visited, 0, used);
+  re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);
+  if (visited != small_visited)
+    free(visited);
+  return 1;
+}
+
+void re_compile_fastmap(bufp)
+regexp_t bufp;
+{
+  if (!bufp->fastmap || bufp->fastmap_accurate)
+    return;
+  assert(bufp->used > 0);
+  if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
+			     bufp->fastmap))
+    return;
+  if (bufp->buffer[0] == Cbol)
+    bufp->anchor = 1;   /* begline */
+  else
+    if (bufp->buffer[0] == Cbegbuf)
+      bufp->anchor = 2; /* begbuf */
+    else
+      bufp->anchor = 0; /* none */
+  bufp->fastmap_accurate = 1;
+}
+
+#define INITIAL_FAILURES  128  /* initial # failure points to allocate */
+#define MAX_FAILURES     4100  /* max # of failure points before failing */
+
+int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
+regexp_t bufp;
+char *string1, *string2;
+int size1, size2, pos, mstop;
+regexp_registers_t regs;
+{
+  struct failure_point { char *text, *partend, *code; }
+    *failure_stack_start, *failure_sp, *failure_stack_end,
+    initial_failure_stack[INITIAL_FAILURES];
+  char *code, *translate, *text, *textend, *partend, *part_2_end;
+  char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
+  char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
+  int a, b, ch, reg, regch, match_end;
+  char *regtext, *regpartend, *regtextend;
+
+#define PREFETCH					\
+  MACRO_BEGIN						\
+    if (text == partend)				\
+      {							\
+	if (text == textend)				\
+	  goto fail;					\
+	text = string2;					\
+	partend = part_2_end;				\
+      }							\
+  MACRO_END
+
+#define NEXTCHAR(var)				\
+  MACRO_BEGIN					\
+    PREFETCH;					\
+    (var) = (unsigned char)*text++;		\
+    if (translate)				\
+      (var) = (unsigned char)translate[(var)];	\
+  MACRO_END
+
+  assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
+  assert(mstop <= size1 + size2);
+  assert(pos <= mstop);
+
+  if (pos <= size1)
+    {
+      text = string1 + pos;
+      if (mstop <= size1)
+	{
+	  partend = string1 + mstop;
+	  textend = partend;
+	}
+      else
+	{
+	  partend = string1 + size1;
+	  textend = string2 + mstop - size1;
+	}
+      part_2_end = string2 + mstop - size1;
+    }
+  else
+    {
+      text = string2 + pos - size1;
+      partend = string2 + mstop - size1;
+      textend = partend;
+      part_2_end = partend;
+    }
+
+  if (bufp->uses_registers && regs != NULL)
+    for (a = 0; a < RE_NREGS; a++)
+      regend_text[a] = NULL;
+
+  code = bufp->buffer;
+  translate = bufp->translate;
+  failure_stack_start = failure_sp = initial_failure_stack;
+  failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
+
+#if 0
+  /* re_search_2 has already done this, and otherwise we get little benefit
+     from this.  So I'll leave this out. */
+  if (bufp->fastmap_accurate && !bufp->can_be_null &&
+      text != textend &&
+      !bufp->fastmap[translate ?
+		     (unsigned char)translate[(unsigned char)*text] :
+		     (unsigned char)*text])
+    return -1;  /* it can't possibly match */
+#endif
+
+ continue_matching:
+  for (;;)
+    {
+      switch (*code++)
+	{
+	case Cend:
+	  if (partend != part_2_end)
+	    match_end = text - string1;
+	  else
+	    match_end = text - string2 + size1;
+	  if (regs)
+	    {
+	      regs->start[0] = pos;
+	      regs->end[0] = match_end;
+	      if (!bufp->uses_registers)
+		{
+		  for (a = 1; a < RE_NREGS; a++)
+		    {
+		      regs->start[a] = -1;
+		      regs->end[a] = -1;
+		    }
+		}
+	      else
+		{
+		  for (a = 1; a < RE_NREGS; a++)
+		    {
+		      if (regend_text[a] == NULL)
+			{
+			  regs->start[a] = -1;
+			  regs->end[a] = -1;
+			  continue;
+			}
+		      if (regstart_partend[a] != part_2_end)
+			regs->start[a] = regstart_text[a] - string1;
+		      else
+			regs->start[a] = regstart_text[a] - string2 + size1;
+		      if (regend_partend[a] != part_2_end)
+			regs->end[a] = regend_text[a] - string1;
+		      else
+			regs->end[a] = regend_text[a] - string2 + size1;
+		    }
+		}
+	    }
+	  if (failure_stack_start != initial_failure_stack)
+	    free((char *)failure_stack_start);
+	  return match_end - pos;
+	case Cbol:
+	  if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
+	    break;
+	  goto fail;
+	case Ceol:
+	  if (text == string2 + size2 ||
+	      (text == string1 + size1 ?
+	       (size2 == 0 || *string2 == '\n') :
+	       *text == '\n'))
+	    break;
+	  goto fail;
+	case Cset:
+	  NEXTCHAR(ch);
+	  if (code[ch/8] & (1<<(ch & 7)))
+	    {
+	      code += 256/8;
+	      break;
+	    }
+	  goto fail;
+	case Cexact:
+	  NEXTCHAR(ch);
+	  if (ch != (unsigned char)*code++)
+	    goto fail;
+	  break;
+	case Canychar:
+	  NEXTCHAR(ch);
+	  if (ch == '\n')
+	    goto fail;
+	  break;
+	case Cstart_memory:
+	  reg = *code++;
+	  regstart_text[reg] = text;
+	  regstart_partend[reg] = partend;
+	  break;
+	case Cend_memory:
+	  reg = *code++;
+	  regend_text[reg] = text;
+	  regend_partend[reg] = partend;
+	  break;
+	case Cmatch_memory:
+	  reg = *code++;
+	  if (regend_text[reg] == NULL)
+	    goto fail;  /* or should we just match nothing? */
+	  regtext = regstart_text[reg];
+	  regtextend = regend_text[reg];
+	  if (regstart_partend[reg] == regend_partend[reg])
+	    regpartend = regtextend;
+	  else
+	    regpartend = string1 + size1;
+	  
+	  for (;regtext != regtextend;)
+	    {
+	      NEXTCHAR(ch);
+	      if (regtext == regpartend)
+		regtext = string2;
+	      regch = (unsigned char)*regtext++;
+	      if (translate)
+		regch = (unsigned char)translate[regch];
+	      if (regch != ch)
+		goto fail;
+	    }
+	  break;
+	case Cstar_jump:
+	  /* star is coded as:
+	       1: failure_jump 2
+	          ... code for operand of star
+		  star_jump 1
+	       2: ... code after star
+	     We change the star_jump to update_failure_jump if we can determine
+	     that it is safe to do so; otherwise we change it to an ordinary
+	     jump.
+	     plus is coded as
+	          jump 2
+	       1: failure_jump 3
+	       2: ... code for operand of plus
+	          star_jump 1
+	       3: ... code after plus
+	     For star_jump considerations this is processed identically
+	     to star. */
+	  a = (unsigned char)*code++;
+	  a |= (unsigned char)*code++ << 8;
+	  a = (int)(short)a;
+	  {
+	    char map[256], can_be_null;
+	    char *p1, *p2;
+
+	    p1 = code + a + 3; /* skip the failure_jump */
+	    assert(p1[-3] == Cfailure_jump);
+	    p2 = code;
+	    /* p1 points inside loop, p2 points to after loop */
+	    if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
+				       p2 - bufp->buffer, &can_be_null, map))
+	      goto make_normal_jump;
+	    /* If we might introduce a new update point inside the loop,
+	       we can't optimize because then update_jump would update a
+	       wrong failure point.  Thus we have to be quite careful here. */
+	  loop_p1:
+	    /* loop until we find something that consumes a character */
+	    switch (*p1++)
+	      {
+              case Cbol:
+              case Ceol:
+              case Cbegbuf:
+              case Cendbuf:
+              case Cwordbeg:
+              case Cwordend:
+              case Cwordbound:
+              case Cnotwordbound:
+#ifdef emacs
+              case Cemacs_at_dot:
+#endif /* emacs */
+                goto loop_p1;
+              case Cstart_memory:
+              case Cend_memory:
+                p1++;
+                goto loop_p1;
+	      case Cexact:
+		ch = (unsigned char)*p1++;
+		if (map[ch])
+		  goto make_normal_jump;
+		break;
+	      case Canychar:
+		for (b = 0; b < 256; b++)
+		  if (b != '\n' && map[b])
+		    goto make_normal_jump;
+		break;
+	      case Cset:
+		for (b = 0; b < 256; b++)
+		  if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
+		    goto make_normal_jump;
+		p1 += 256/8;
+		break;
+	      default:
+		goto make_normal_jump;
+	      }
+	    /* now we know that we can't backtrack. */
+	    while (p1 != p2 - 3)
+	      {
+		switch (*p1++)
+		  {
+		  case Cend:
+		    abort();  /* we certainly shouldn't get this inside loop */
+		    /*NOTREACHED*/
+		  case Cbol:
+		  case Ceol:
+		  case Canychar:
+		  case Cbegbuf:
+		  case Cendbuf:
+		  case Cwordbeg:
+		  case Cwordend:
+		  case Cwordbound:
+		  case Cnotwordbound:
+#ifdef emacs
+		  case Cemacs_at_dot:
+#endif /* emacs */
+		    break;
+		  case Cset:
+		    p1 += 256/8;
+		    break;
+		  case Cexact:
+		  case Cstart_memory:
+		  case Cend_memory:
+		  case Cmatch_memory:
+		  case Csyntaxspec:
+		  case Cnotsyntaxspec:
+		    p1++;
+		    break;
+		  case Cjump:
+		  case Cstar_jump:
+		  case Cfailure_jump:
+		  case Cupdate_failure_jump:
+		  case Cdummy_failure_jump:
+		    goto make_normal_jump;
+		  default:
+		    printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
+		    break;
+		  }
+	      }
+	    goto make_update_jump;
+	  }
+	make_normal_jump:
+	  /* printf("changing to normal jump\n"); */
+	  code -= 3;
+	  *code = Cjump;
+	  break;
+	make_update_jump:
+	  /* printf("changing to update jump\n"); */
+	  code -= 2;
+	  a += 3;  /* jump to after the Cfailure_jump */
+	  code[-1] = Cupdate_failure_jump;
+	  code[0] = a & 0xff;
+	  code[1] = a >> 8;
+	  /* fall to next case */
+	case Cupdate_failure_jump:
+	  failure_sp[-1].text = text;
+	  failure_sp[-1].partend = partend;
+	  /* fall to next case */
+	case Cjump:
+	  a = (unsigned char)*code++;
+	  a |= (unsigned char)*code++ << 8;
+	  code += (int)(short)a;
+	  break;
+	case Cdummy_failure_jump:
+	case Cfailure_jump:
+	  if (failure_sp == failure_stack_end)
+	    {
+	      if (failure_stack_start != initial_failure_stack)
+		goto error;
+	      failure_stack_start = (struct failure_point *)
+		malloc(MAX_FAILURES * sizeof(*failure_stack_start));
+	      failure_stack_end = failure_stack_start + MAX_FAILURES;
+	      memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
+		     INITIAL_FAILURES * sizeof(*failure_stack_start));
+	      failure_sp = failure_stack_start + INITIAL_FAILURES;
+	    }
+	  a = (unsigned char)*code++;
+	  a |= (unsigned char)*code++ << 8;
+	  a = (int)(short)a;
+	  if (code[-3] == Cdummy_failure_jump)
+	    { /* this is only used in plus */
+	      assert(*code == Cfailure_jump);
+	      b = (unsigned char)code[1];
+	      b |= (unsigned char)code[2] << 8;
+	      failure_sp->code = code + (int)(short)b + 3;
+	      failure_sp->text = NULL;
+	      code += a;
+	    }
+	  else
+	    {
+	      failure_sp->code = code + a;
+	      failure_sp->text = text;
+	      failure_sp->partend = partend;
+	    }
+	  failure_sp++;
+	  break;
+	case Cbegbuf:
+	  if (text == string1)
+	    break;
+	  goto fail;
+	case Cendbuf:
+	  if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
+	    break;
+	  goto fail;
+	case Cwordbeg:
+	  if (text == string2 + size2)
+	    goto fail;
+	  if (size2 == 0 && text == string1 + size1)
+	    goto fail;
+	  if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
+	    goto fail;
+	  if (text == string1)
+	    break;
+	  if (SYNTAX(text[-1]) != Sword)
+	    break;
+	  goto fail;
+	case Cwordend:
+	  if (text == string1)
+	    goto fail;
+	  if (SYNTAX(text[-1]) != Sword)
+	    goto fail;
+	  if (text == string2 + size2)
+	    break;
+	  if (size2 == 0 && text == string1 + size1)
+	    break;
+	  if (SYNTAX(*text) == Sword)
+	    goto fail;
+	  break;
+	case Cwordbound:
+	  /* Note: as in gnu regexp, this also matches at the beginning
+	     and end of buffer. */
+	  if (text == string1 || text == string2 + size2 ||
+	      (size2 == 0 && text == string1 + size1))
+	    break;
+	  if ((SYNTAX(text[-1]) == Sword) ^
+	      (SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
+	    break;
+	  goto fail;
+	case Cnotwordbound:
+	  /* Note: as in gnu regexp, this never matches at the beginning
+	     and end of buffer. */
+	  if (text == string1 || text == string2 + size2 ||
+	      (size2 == 0 && text == string1 + size1))
+	    goto fail;
+	  if (!((SYNTAX(text[-1]) == Sword) ^
+		(SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
+	    goto fail;
+	  break;
+	case Csyntaxspec:
+	  NEXTCHAR(ch);
+	  if (SYNTAX(ch) != (unsigned char)*code++)
+	    goto fail;
+	  break;
+	case Cnotsyntaxspec:
+	  NEXTCHAR(ch);
+	  if (SYNTAX(ch) != (unsigned char)*code++)
+	    break;
+	  goto fail;
+#ifdef emacs
+	case Cemacs_at_dot:
+	  if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
+	    goto fail;
+	  break;
+#endif /* emacs */
+	default:
+	  abort();
+	  /*NOTREACHED*/
+	}
+    }
+  abort();
+  /*NOTREACHED*/
+
+ fail:
+  if (failure_sp != failure_stack_start)
+    {
+      failure_sp--;
+      text = failure_sp->text;
+      if (text == NULL)
+	goto fail;
+      partend = failure_sp->partend;
+      code = failure_sp->code;
+      goto continue_matching;
+    }
+  if (failure_stack_start != initial_failure_stack)
+    free((char *)failure_stack_start);
+  return -1;
+
+ error:
+  if (failure_stack_start != initial_failure_stack)
+    free((char *)failure_stack_start);
+  return -2;
+}
+
+#undef PREFETCH
+#undef NEXTCHAR
+#undef PUSH_FAILURE
+
+int re_match(bufp, string, size, pos, regs)
+regexp_t bufp;
+char *string;
+int size, pos;
+regexp_registers_t regs;
+{
+  return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
+}
+
+int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
+		mstop)
+regexp_t bufp;
+char *string1, *string2;
+int size1, size2, pos, range, mstop;
+regexp_registers_t regs;
+{
+  char *fastmap, *translate, *text, *partstart, *partend;
+  int dir, ret;
+  char anchor;
+  
+  assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
+  assert(pos + range + 1 >= 0 && pos + range - 1 <= size1 + size2);
+  assert(pos <= mstop);
+  
+  fastmap = bufp->fastmap;
+  translate = bufp->translate;
+  if (fastmap && !bufp->fastmap_accurate)
+    re_compile_fastmap(bufp);
+  anchor = bufp->anchor;
+  if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
+    fastmap = NULL;
+  if (range < 0)
+    {
+      dir = -1;
+      range = -range;
+    }
+  else
+    dir = 1;
+  if (anchor == 2)
+    if (pos != 0)
+      return -1;
+    else
+      range = 0;
+  for (; range >= 0; range--, pos += dir)
+    {
+      if (fastmap)
+	{
+	  if (dir == 1)
+	    { /* searching forwards */
+	      if (pos < size1)
+		{
+		  text = string1 + pos;
+		  if (pos + range > size1)
+		    partend = string1 + size1;
+		  else
+		    partend = string1 + pos + range;
+		}
+	      else
+		{
+		  text = string2 + pos - size1;
+		  partend = string2 + pos + range - size1;
+		}
+	      partstart = text;
+	      if (translate)
+		while (text != partend &&
+		       !fastmap[(unsigned char)
+				translate[(unsigned char)*text]])
+		  text++;
+	      else
+		while (text != partend && !fastmap[(unsigned char)*text])
+		  text++;
+	      pos += text - partstart;
+	      range -= text - partstart;
+	      if (pos == size1 + size2 && bufp->can_be_null == 0)
+		return -1;
+	    }
+	  else
+	    { /* searching backwards */
+	      if (pos <= size1)
+		{
+		  text = string1 + pos;
+		  partstart = string1 + pos - range;
+		}
+	      else
+		{
+		  text = string2 + pos - size1;
+		  if (range < pos - size1)
+		    partstart = string2 + pos - size1 - range;
+		  else
+		    partstart = string2;
+		}
+	      partend = text;
+	      if (translate)
+		while (text != partstart &&
+		       !fastmap[(unsigned char)
+				translate[(unsigned char)*text]])
+		  text--;
+	      else
+		while (text != partstart &&
+		       !fastmap[(unsigned char)*text])
+		  text--;
+	      pos -= partend - text;
+	      range -= partend - text;
+	    }
+	}
+      if (anchor == 1)
+	{ /* anchored to begline */
+	  if (pos > 0 &&
+	      (pos <= size1 ? string1[pos - 1] :
+	       string2[pos - size1 - 1]) != '\n')
+	    continue;
+	}
+      assert(pos >= 0 && pos <= size1 + size2);
+      ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
+      if (ret >= 0)
+	return pos;
+      if (ret == -2)
+	return -2;
+    }
+  return -1;
+}
+
+int re_search(bufp, string, size, startpos, range, regs)
+regexp_t bufp;
+char *string;
+int size, startpos, range;
+regexp_registers_t regs;
+{
+  return re_search_2(bufp, string, size, (char *)NULL, 0,
+		     startpos, range, regs, size);
+}
+
+static struct re_pattern_buffer re_comp_buf;
+
+char *re_comp(s)
+char *s;
+{
+  if (s == NULL)
+    {
+      if (!re_comp_buf.buffer)
+	return "Out of memory";
+      return NULL;
+    }
+  if (!re_comp_buf.buffer)
+    {
+      /* the buffer will be allocated automatically */
+      re_comp_buf.fastmap = malloc(256);
+      re_comp_buf.translate = NULL;
+    }
+  return re_compile_pattern(s, strlen(s), &re_comp_buf);
+}
+
+int re_exec(s)
+char *s;
+{
+  int len = strlen(s);
+  
+  return re_search(&re_comp_buf, s, len, 0, len, (regexp_registers_t)NULL) >= 0;
+}
+
+#ifdef TEST_REGEXP
+
+int main()
+{
+  char buf[500];
+  char *cp;
+  struct re_pattern_buffer exp;
+  struct re_registers regs;
+  int a,pos;
+  char fastmap[256];
+
+  exp.allocated = 0;
+  exp.buffer = 0;
+  exp.translate = NULL;
+  exp.fastmap = fastmap;
+
+  /* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
+
+  while (1)
+    {
+      printf("Enter regexp:\n");
+      gets(buf);
+      cp=re_compile_pattern(buf, strlen(buf), &exp);
+      if (cp)
+	{
+	  printf("Error: %s\n", cp);
+	  continue;
+	}
+      re_compile_fastmap(&exp);
+      printf("dump:\n");
+      for (pos = 0; pos < exp.used;)
+	{
+	  printf("%d: ", pos);
+	  switch (exp.buffer[pos++])
+	    {
+	    case Cend:
+	      strcpy(buf, "end");
+	      break;
+	    case Cbol:
+	      strcpy(buf, "bol");
+	      break;
+	    case Ceol:
+	      strcpy(buf, "eol");
+	      break;
+	    case Cset:
+	      strcpy(buf, "set ");
+	      for (a = 0; a < 256/8; a++)
+		sprintf(buf+strlen(buf)," %02x",
+			(unsigned char)exp.buffer[pos++]);
+	      break;
+	    case Cexact:
+	      sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
+		      (unsigned char)exp.buffer[pos]);
+	      pos++;
+	      break;
+	    case Canychar:
+	      strcpy(buf, "anychar");
+	      break;
+	    case Cstart_memory:
+	      sprintf(buf, "start_memory %d", exp.buffer[pos++]);
+	      break;
+	    case Cend_memory:
+	      sprintf(buf, "end_memory %d", exp.buffer[pos++]);
+	      break;
+	    case Cmatch_memory:
+	      sprintf(buf, "match_memory %d", exp.buffer[pos++]);
+	      break;
+	    case Cjump:
+	    case Cdummy_failure_jump:
+	    case Cstar_jump:
+	    case Cfailure_jump:
+	    case Cupdate_failure_jump:
+	      a = (unsigned char)exp.buffer[pos++];
+	      a += (unsigned char)exp.buffer[pos++] << 8;
+	      a = (int)(short)a;
+	      switch (exp.buffer[pos-3])
+		{
+		case Cjump:
+		  cp = "jump";
+		  break;
+		case Cstar_jump:
+		  cp = "star_jump";
+		  break;
+		case Cfailure_jump:
+		  cp = "failure_jump";
+		  break;
+		case Cupdate_failure_jump:
+		  cp = "update_failure_jump";
+		  break;
+		case Cdummy_failure_jump:
+		  cp = "dummy_failure_jump";
+		  break;
+		default:
+		  cp = "unknown jump";
+		  break;
+		}
+	      sprintf(buf, "%s %d", cp, a + pos);
+	      break;
+	    case Cbegbuf:
+	      strcpy(buf,"begbuf");
+	      break;
+	    case Cendbuf:
+	      strcpy(buf,"endbuf");
+	      break;
+	    case Cwordbeg:
+	      strcpy(buf,"wordbeg");
+	      break;
+	    case Cwordend:
+	      strcpy(buf,"wordend");
+	      break;
+	    case Cwordbound:
+	      strcpy(buf,"wordbound");
+	      break;
+	    case Cnotwordbound:
+	      strcpy(buf,"notwordbound");
+	      break;
+	    default:
+	      sprintf(buf, "unknown code %d",
+		      (unsigned char)exp.buffer[pos - 1]);
+	      break;
+	    }
+	  printf("%s\n", buf);
+	}
+      printf("can_be_null = %d uses_registers = %d anchor = %d\n",
+	     exp.can_be_null, exp.uses_registers, exp.anchor);
+      
+      printf("fastmap:");
+      for (a = 0; a < 256; a++)
+	if (exp.fastmap[a])
+	  printf(" %d", a);
+      printf("\n");
+      printf("Enter strings.  An empty line terminates.\n");
+      while (fgets(buf, sizeof(buf), stdin))
+	{
+	  if (buf[0] == '\n')
+	    break;
+	  a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
+	  printf("search returns %d\n", a);
+	  if (a != -1)
+	    {
+	      for (a = 0; a < RE_NREGS; a++)
+		{
+		  printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
+		}
+	    }
+	}
+    }
+}
+
+#endif /* TEST_REGEXP */
diff --git a/Modules/regexpr.h b/Modules/regexpr.h
new file mode 100644
index 0000000..8c5a2ac
--- /dev/null
+++ b/Modules/regexpr.h
@@ -0,0 +1,135 @@
+/*
+
+regexpr.h
+
+Author: Tatu Ylonen <ylo@ngs.fi>
+
+Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
+
+Permission to use, copy, modify, distribute, and sell this software
+and its documentation for any purpose is hereby granted without fee,
+provided that the above copyright notice appear in all copies.  This
+software is provided "as is" without express or implied warranty.
+
+Created: Thu Sep 26 17:15:36 1991 ylo
+Last modified: Mon Nov  4 15:49:46 1991 ylo
+
+*/
+
+#ifndef REGEXPR_H
+#define REGEXPR_H
+
+#define RE_NREGS	10  /* number of registers available */
+
+typedef struct re_pattern_buffer
+{
+  char *buffer; 	 /* compiled pattern */
+  int allocated;	 /* allocated size of compiled pattern */
+  int used;		 /* actual length of compiled pattern */
+  char *fastmap;	 /* fastmap[ch] is true if ch can start pattern */
+  char *translate;	 /* translation to apply during compilation/matching */
+  char fastmap_accurate; /* true if fastmap is valid */
+  char can_be_null;	 /* true if can match empty string */
+  char uses_registers;	 /* registers are used and need to be initialized */
+  char anchor;		 /* anchor: 0=none 1=begline 2=begbuf */
+} *regexp_t;
+
+typedef struct re_registers
+{
+  int start[RE_NREGS];  /* start offset of region */
+  int end[RE_NREGS];    /* end offset of region */
+} *regexp_registers_t;
+
+/* bit definitions for syntax */
+#define RE_NO_BK_PARENS		1    /* no quoting for parentheses */
+#define RE_NO_BK_VBAR		2    /* no quoting for vertical bar */
+#define RE_BK_PLUS_QM		4    /* quoting needed for + and ? */
+#define RE_TIGHT_VBAR		8    /* | binds tighter than ^ and $ */
+#define RE_NEWLINE_OR		16   /* treat newline as or */
+#define RE_CONTEXT_INDEP_OPS	32   /* ^$?*+ are special in all contexts */
+#define RE_ANSI_HEX		64   /* ansi sequences (\n etc) and \xhh */
+#define RE_NO_GNU_EXTENSIONS   128   /* no gnu extensions */
+
+/* definitions for some common regexp styles */
+#define RE_SYNTAX_AWK	(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_CONTEXT_INDEP_OPS)
+#define RE_SYNTAX_EGREP	(RE_SYNTAX_AWK|RE_NEWLINE_OR)
+#define RE_SYNTAX_GREP	(RE_BK_PLUS_QM|RE_NEWLINE_OR)
+#define RE_SYNTAX_EMACS	0
+
+#ifdef __STDC__
+
+int re_set_syntax(int syntax);
+/* This sets the syntax to use and returns the previous syntax.  The
+   syntax is specified by a bit mask of the above defined bits. */
+
+char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
+/* This compiles the regexp (given in regex and length in regex_size).
+   This returns NULL if the regexp compiled successfully, and an error
+   message if an error was encountered.  The buffer field must be
+   initialized to a memory area allocated by malloc (or to NULL) before
+   use, and the allocated field must be set to its length (or 0 if buffer is
+   NULL).  Also, the translate field must be set to point to a valid
+   translation table, or NULL if it is not used. */
+
+int re_match(regexp_t compiled, char *string, int size, int pos,
+	     regexp_registers_t regs);
+/* This tries to match the regexp against the string.  This returns the
+   length of the matched portion, or -1 if the pattern could not be
+   matched and -2 if an error (such as failure stack overflow) is
+   encountered. */
+
+int re_match_2(regexp_t compiled, char *string1, int size1,
+	      char *string2, int size2, int pos, regexp_registers_t regs,
+	       int mstop);
+/* This tries to match the regexp to the concatenation of string1 and
+   string2.  This returns the length of the matched portion, or -1 if the
+   pattern could not be matched and -2 if an error (such as failure stack
+   overflow) is encountered. */
+
+int re_search(regexp_t compiled, char *string, int size, int startpos,
+	      int range, regexp_registers_t regs);
+/* This rearches for a substring matching the regexp.  This returns the first
+   index at which a match is found.  range specifies at how many positions to
+   try matching; positive values indicate searching forwards, and negative
+   values indicate searching backwards.  mstop specifies the offset beyond
+   which a match must not go.  This returns -1 if no match is found, and
+   -2 if an error (such as failure stack overflow) is encountered. */
+
+int re_search_2(regexp_t compiled, char *string1, int size1,
+		char *string2, int size2, int startpos, int range,
+		regexp_registers_t regs, int mstop);
+/* This is like re_search, but search from the concatenation of string1 and
+   string2.  */
+
+void re_compile_fastmap(regexp_t compiled);
+/* This computes the fastmap for the regexp.  For this to have any effect,
+   the calling program must have initialized the fastmap field to point
+   to an array of 256 characters. */
+
+char *re_comp(char *s);
+/* BSD 4.2 regex library routine re_comp.  This compiles the regexp into
+   an internal buffer.  This returns NULL if the regexp was compiled
+   successfully, and an error message if there was an error. */
+
+int re_exec(char *s);
+/* BSD 4.2 regexp library routine re_exec.  This returns true if the string
+   matches the regular expression (that is, a matching part is found
+   anywhere in the string). */
+
+#else /* __STDC__ */
+
+int re_set_syntax();
+char *re_compile_pattern();
+int re_match();
+int re_match_2();
+int re_search();
+int re_search_2();
+void re_compile_fastmap();
+char *re_comp();
+int re_exec();
+
+#endif /* __STDC__ */
+
+#endif /* REGEXPR_H */
+
+
author	Guido van Rossum <guido@python.org>	1992-01-19 16:32:47 (GMT)
committer	Guido van Rossum <guido@python.org>	1992-01-19 16:32:47 (GMT)
commit	b674c3bba50b1f5ea212dd85366f871872ad0be4 (patch)
tree	3ef14c83c8ef59d1d1943230fa75b9d6d323fd6c /Modules
parent	1cab95c14dd56988030a6899ac8934aeab63affc (diff)
download	cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.zip cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.tar.gz cpython-b674c3bba50b1f5ea212dd85366f871872ad0be4.tar.bz2