// RegEx RISC OS module source

/* Extended regular expression matching and search library,
   version 0.12.
   (Implements POSIX draft P10003.2/D11.2, except for
   internationalization features.)

   Copyright (C) 1993 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "kernel.h"
#include "regex.h"
#include "config.h"


// Defines

/* If this bit is set, then the expression will be converted in it's
   entirety to be case-insensitive - for SWI RegEx_CompileExtendedPattern
   ONLY. An error will be generated if this bit is set in CompilePattern.
   When this bit is set, the extensions \i & \I are ignored.  */
#define RE_CASE_INSENSITIVE (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)

#if RE_CASE_INSENSITIVE != (1 << 18)
#error GNU regex does not seem to be version 0.12
#endif

// Default syntax
#define FAV_SYNTAX (  \
  RE_BACKSLASH_ESCAPE_IN_LISTS |  \
  RE_CHAR_CLASSES |  \
  RE_DOT_NOT_NULL |  \
  RE_HAT_LISTS_NOT_NEWLINE |  \
  RE_INTERVALS |  \
  RE_NO_BK_BRACES |  \
  RE_NO_BK_PARENS |  \
  RE_NO_BK_VBAR |  \
  RE_NO_EMPTY_RANGES |  \
  RE_UNMATCHED_RIGHT_PAREN_ORD )

#define R0 (r->r[0])
#define R1 (r->r[1])
#define R2 (r->r[2])
#define R3 (r->r[3])
#define R4 (r->r[4])
#define R5 (r->r[5])
#define R6 (r->r[6])
#define R7 (r->r[7])
#define R8 (r->r[8])
#define R9 (r->r[9])

#define OSERR(n) (&(RegExErrors[(n)]))

#define UNUSED(x) (x = x)

// Must conform with macro definition from c.RegEx
#define BYTEWIDTH 8

// Types
typedef enum
{
  RegEx_CompilePattern,
  RegEx_CompileFastmap,
  RegEx_Search,
  RegEx_Search2,
  RegEx_Match,
  RegEx_Match2,
  RegEx_SetRegisters,
  RegEx_Free,
  RegEx_PrintCompiledPattern,
  RegEx_CompileExtendedPattern,
  RegEx_GetDefaultSyntax,
  RegEx_SetDefaultSyntax
} SWI_list;


struct regexbuf
{
  regex_t             *expr;
  struct re_registers *regs;
  int                  user_regs; /* regs are allocated by user */
  struct regexbuf     *prev;
  struct regexbuf     *next;
};

typedef struct regexbuf *RegExBuf;


// Error codes
// static const _kernel_oserror InternalError =
//    { 0x816500, "unknown internal error" };

static const _kernel_oserror RegExErrors[] =
{
  {        0, NULL },                                    // REG_NOERROR
  { 0x816501, "No match" },                              // REG_NOMATCH
  { 0x816502, "Invalid regular expression" },            // REG_BADPAT
  { 0x816503, "Invalid collation character" },           // REG_ECOLLATE
  { 0x816504, "Invalid character class name" },          // REG_ECTYPE
  { 0x816505, "Trailing backslash" },                    // REG_EESCAPE
  { 0x816506, "Invalid back reference" },                // REG_ESUBREG
  { 0x816507, "Unmatched [ or [^" },                     // REG_EBRACK
  { 0x816508, "Unmatched ( or \\(" },                    // REG_EPAREN
  { 0x816509, "Unmatched \\{" },                         // REG_EBRACE
  { 0x81650A, "Invalid content of \\{\\}" },             // REG_BADBR
  { 0x81650B, "Invalid range end" },                     // REG_ERANGE
  { 0x81650C, "Memory exhausted" },                      // REG_ESPACE
  { 0x81650D, "Invalid preceding regular expression" },  // REG_BADRPT
  { 0x81650E, "Premature end of regular expression" },   // REG_EEND
  { 0x81650F, "Regular expression too big" },            // REG_ESIZE
  { 0x816510, "Unmatched ) or \\)" },                    // REG_ERPAREN
};

static const _kernel_oserror DuffHandleError =
  { 0x816540, "Invalid RegEx handle" };
static const _kernel_oserror UndefinedSWIError =
  { 0x816541, "Unknown RegEx SWI" };
static const _kernel_oserror GNUInternalError =
  { 0x816542, "GNU-regex internal error" };
#ifndef DEBUG
static const _kernel_oserror NoDebugError =
  { 0x816543, "Regex not built with debug in" };
#endif
static const _kernel_oserror CaseInsensitiveError =
  { 0x816544, "Case-insensitive syntax not allowed" };
static const _kernel_oserror ExtensionFailedError =
  { 0x816545, "Could not created extended expression" };



// Global variables. Is this allowed in modules??
static RegExBuf               buffers;
static const _kernel_oserror *error;
static int                    default_syntax = FAV_SYNTAX;
static char                  *new_expr = NULL; // globally reused buffer
static int                    new_expr_size = 0;



// Ref. to not-normally-exported GNU routine (to get error number)
extern reg_errcode_t regex_compile(
  const char *pattern,
  int size,
  reg_syntax_t syntax,
  struct re_pattern_buffer *bufp);

#ifdef DEBUG
// Ref. to not-normally-exported GNU routine (for debug)
extern void print_compiled_pattern(struct re_pattern_buffer *bufp);
#endif


// Startup
const _kernel_oserror *
initialisation_code(char *cmd_tail, int podule_base, void *pw)
{
  buffers = NULL;
  return NULL;
}


// Clear a buffer
static void
free_buffer(RegExBuf buffer)
{
  // Maybe clear regs
  if (!buffer->user_regs)
  {
    // Not user-specified regs, so free 'em
    if (buffer->regs->start)
    {
      free(buffer->regs->start);
    }
    if (buffer->regs->end)
    {
      free(buffer->regs->end);
    }
  }
  free(buffer->regs);

  // Get rid of expr and regs
  regfree(buffer->expr);
  free(buffer->expr);

  // Remove buffer from list
  if (buffer->prev)
  {
    // This is not the first buffer of the chain ...
    buffer->prev->next = buffer->next;
  }
  if (buffer->next)
  {
    // This is not the last buffer of the chain ...
    buffer->next->prev = buffer->prev;
  }
  if (buffers == buffer)
  {
    // We are at the first buffer.
    buffers = buffer->next;
  }
  free(buffer);
}


// Clean up
const _kernel_oserror *
finalisation_code(int fatal, int podule_base, void *pw)
{
  // Free all buffers that have not been freed
  // by the clients before exiting.
  while (buffers)
  {
     free_buffer(buffers);
  }

  return NULL;
}


// From GNU re_compile_pattern() - but returning errno not errstr.
// Pity we have to clone that code, but we want to be able to issue
// nice RISC OS error messages with error numbers and not just errstr.
static reg_errcode_t
compile_pattern(
  const char               *pattern,
  int                       length,
  reg_syntax_t              syntax,
  struct re_pattern_buffer *bufp)
{
  // GNU code is written to assume at least RE_NREGS registers will be set
  // (and at least one extra will be -1).
  bufp->regs_allocated = REGS_UNALLOCATED;

  // And GNU code determines whether or not to get register information
  // by passing null for the REGS argument to re_match, etc., not by
  // setting no_sub.
  bufp->no_sub = 0;

  // Match anchors at newline.
  bufp->newline_anchor = 1;

  return regex_compile(pattern, length, syntax, bufp);
}


// Check whether given buffer is a known one.
static RegExBuf
check_buffer(RegExBuf buffer)
{
  RegExBuf iter = buffers;
  while (iter)
  {
    if (iter == buffer)
    {
      return buffer;
    }
    iter = iter->next;
  }

  error = &DuffHandleError;
  return (RegExBuf) -1;
}


// Try to create a new buffer.
static RegExBuf
new_buffer(void)
{
  RegExBuf buffer = malloc(sizeof(struct regexbuf));
  if (buffer)
  {
    memset(buffer, 0, sizeof(struct regexbuf));
  }
  else
  {
    error = OSERR(REG_ESPACE);
    return (RegExBuf) -1;
  }

  buffer->expr = malloc(sizeof(regex_t));
  if (buffer->expr)
  {
    memset(buffer->expr, 0, sizeof(regex_t));
  }
  else
  {
    free(buffer);
    error = OSERR(REG_ESPACE);
    return (RegExBuf) -1;
  }

  buffer->regs = malloc(sizeof(struct re_registers));
  if (buffer->regs)
  {
    memset(buffer->regs, 0, sizeof(struct re_registers));
  }
  else
  {
    free(buffer->expr);
    free(buffer);
    error = OSERR(REG_ESPACE);
    return (RegExBuf) -1;
  }

  buffer->user_regs = 0;

  // Hang into buffer list
  buffer->prev = NULL;
  buffer->next = buffers;
  if (buffers)
  {
    buffers->prev = buffer;
  }
  buffers = buffer;

  return buffer;
}


// Add STR to string pointed to by PTR, incrementing PTR
static void
strext(char **ptr, char *str)
{
  while (*str)
    *((*ptr)++) = *(str++);
  **ptr = '\0';
}


// Convert input EXPR into NEW_EXPR, parsing extra facilties
// Return NEW_EXPR length in DEST_LEN
static int
extend_pattern(char *expr, int *dest_len, int global_case_insensitive)
{
  char *ps, *pd;
  int   escaped = 0;
  int   in_set = 0;
  int   case_insensitive = 0;

  // Init. new_expr - seems to hang if done at Init.
  if (!new_expr)
  {
    new_expr_size = 128;
    new_expr = malloc(new_expr_size);
    if (!new_expr)
    {
      error = OSERR(REG_ESPACE);
      return 0;
    }
  }

  // Run through source expression
  for (pd = new_expr, ps = expr; pd && *ps; ps++)
  {
    // Lengthen new_expr? Use a safety margin of 16 bytes because
    // the extended replacement can be up to 13 bytes long.
    if (pd - new_expr >= new_expr_size - 16)
    {
      // Yup
      char *new_ptr;

      new_expr_size *= 2;
      new_ptr = realloc(new_expr, new_expr_size);

      if (!new_ptr)
      {
        new_expr_size /= 2;
        error = OSERR(REG_ESPACE);
        return 0;
      }
      else
      {
        // If realloc was successful and the buffer moved from new_expr
        // to new_ptr, then pd should also move by the difference of
        // the moved buffers.  Up until 1.04 this was not done and long
        // regexps therefore crashed.  This bug was noticed and fixed by
        // Martin Avison on 07/11/2006 and incoroporated in v1.05.
        pd += new_ptr - new_expr;
        new_expr = new_ptr;
      }
    }

    // Just had valid back-slash?
    if (escaped)
    {
      // Yup
      escaped = 0;

      // See if we interpret the next character
      switch (*ps)
      {
        case 'a' :  /* "alphanumeric */
          strext(&pd, "[A-Za-z0-9]");
          break;

        case 'A' :  /* Not alphanumeric */
          strext(&pd, "[^A-Za-z0-9]");
          break;

        case 'd' :  /* Digit */
          strext(&pd, "[0-9]");
          break;

        case 'D' :  /* Not digit */
          strext(&pd, "[^0-9]");
          break;

        case 'f' :  /* Float */
          strext(&pd, "[0-9.]");
          break;

        case 'F' :  /* Not float */
          strext(&pd, "[^0-9.]");
          break;

        case 'h' :  /* Hex digit */
          strext(&pd, "[0-9A-Fa-f]");
          break;

        case 'H' :  /* Not hex digit */
          strext(&pd, "[^0-9A-Fa-f]");
          break;

        case 'i' :  /* start case-insensitive */
          case_insensitive++;
          break;

        case 'I' :  /* stop case-insensitive */
          if (case_insensitive)
            case_insensitive--;
          break;

        case 'l' :  /* lower-case Id */
          strext(&pd, "[a-z0-9_]");
          break;

        case 'L' :  /* Not lower-case Id */
          strext(&pd, "[^a-z0-9_]");
          break;

        case 'n' :  /* new-line (10) */
          *(pd++) = '\012';
          break;

        case 'N' :  /* Not new-line (10) */
          strext(&pd, "[^\012]");
          break;

        case 'r' :  /* return (13) */
          *(pd++) = '\015';
          break;

        case 'R' :  /* Not return (13) */
          strext(&pd, "[^\015]");
          break;

        case 't' :  /* tab (9) */
          *(pd++) = '\011';
          break;

        case 'T' :  /* Not tab (9) */
          strext(&pd, "[^\011]");
          break;

        case 'u' :  /* upper-case Id */
          strext(&pd, "[A-Z0-9_]");
          break;

        case 'U' :  /* Not upper-case Id */
          strext(&pd, "[^A-Z0-9_]");
          break;

        case 'w' :  /* "word" char. */
          strext(&pd, "[A-Za-z0-9_]");
          break;

        case 'W' :  /* Not "word" char. */
          strext(&pd, "[^A-Za-z0-9_]");
          break;

        case 's' :  /* Whitespace */
          strext(&pd, "[\011\012\040]");
          break;

        case 'S' :  /* Not whitespace */
          strext(&pd, "[^\011\012\040]");
          break;

        default :
          // Everything else, leave alone (& re-insert backslash)
          *(pd++) = '\\';
          *(pd++) = *ps;
      }
    }
    else if ('[' == *ps)
    {
      // Start char. set
      in_set = 1;
      *(pd++) = *ps;
    }
    else if (in_set && ']' == *ps)
    {
      // End char. set
      in_set = 0;
      *(pd++) = *ps;
    }
    else if (!in_set && '\\' == *ps)
    {
      // Clean backslash - escaped char.
      escaped = 1;
    }
    else
    {
      // Normal character - check for case-insensitivity
      if (global_case_insensitive || case_insensitive)
      {
        char lower, upper, do_it = 0;

        // Make case-insensitive class
        if (*ps >= 'A'  &&  *ps <= 'Z')
        {
          // Given upper; find lower
          lower = (char) ((int) (*ps) - 'A' + 'a');
          upper = *ps;
          do_it = 1;
        }
        else if (*ps >= 'a'  &&  *ps <= 'z')
        {
          // Given lower
          lower = *ps;
          upper = (char) ((int) (*ps) - 'a' + 'A');
          do_it = 1;
        }

        if (do_it)
        {
          // Add class
          *(pd++) = '[';
          *(pd++) = lower;
          *(pd++) = upper;
          *(pd++) = ']';
        }
        else
        {
          // Wasn't a letter
          *(pd++) = *ps;
        }
      }
      else
      {
        // Straight copy
        *(pd++) = *ps;
      }
    }
  }
  *pd = '\0';

  // Return dest. length
  *dest_len = (int) pd - (int) new_expr;

  if (*ps != '\0')
  {
    error = &ExtensionFailedError;
    return 0;
  }

  return 1; // Success
}


// Process SWIs
const _kernel_oserror *
swi_handler(int swi_number, _kernel_swi_regs *r, void *pw)
{
  struct re_registers *regs   = NULL;
  regex_t             *expr   = NULL;
  RegExBuf             buffer = NULL;

  // Maybe get buffer from handle
  if (swi_number != RegEx_GetDefaultSyntax &&
      swi_number != RegEx_SetDefaultSyntax)
  {
    if (R0)
    {
      buffer = check_buffer((RegExBuf) R0);
    }
    else
    {
      buffer = new_buffer();
      R0 = (int) buffer;
    }

    // Got one?
    if ((int) buffer != -1)
    {
      expr = buffer->expr;
      regs = buffer->regs;
    }
    else
    {
      return error;
    }
  }

  // Now process SWI - nearly all have R0 = buffer-handle
  switch (swi_number)
  {
    // R1 = pattern
    // R2 = length (strlen if -1)
    // R3 = syntax (previous/default if -1)
    //   returns:
    // R4 = error-code
    case RegEx_CompilePattern:
    {
      // Calculate length if not given
      if (R2 == -1)
        R2 = strlen((char *) R1);

      // Use previous syntax?
      if (R3 == -1)
      {
        if (expr->allocated)
          R3 = (int) expr->syntax;
        else
          R3 = default_syntax;
      }

      // Check for case-insensitivity - only allowed in CompileExtended
      if (R3 & RE_CASE_INSENSITIVE)
        return &CaseInsensitiveError;

      // Do it
      R4 = compile_pattern((char *) R1, R2, (reg_syntax_t) R3, expr);
      if (R4)
        return OSERR(R4);

      break;
    }

    //   returns:
    // R1 = error-code
    case RegEx_CompileFastmap:
    {
      // Do it
      expr->fastmap = malloc(1 << BYTEWIDTH);
      if (!expr->fastmap)
      {
        return OSERR(REG_ESPACE);
      }
      else
      {
        R1 = re_compile_fastmap(expr);
        if (R1)
          return &GNUInternalError;
      }

      break;
    }

    // R1 = string
    // R2 = length (strlen if -1)
    // R3 = start
    // R4 = range  (length-start if -1)
    //   returns:
    // R5 = position of match (or -1=none or -2=error)
    // R6 = pointer to regs
    case RegEx_Search:
    {
      // Calculate lengths if not given
      if (R2 == -1)
        R2 = strlen((char *) R1);
      if (R4 == -1)
        R4 = R2 - R3;

      // Do it
      R5 = re_search(expr, (char *) R1, R2, R3, R4, regs);
      R6 = (int) regs;

      break;
    }

    // R1 = string1
    // R2 = length1 (strlen if -1)
    // R3 = string2
    // R4 = length2 (strlen if -1)
    // R5 = start
    // R6 = range  (length1+length2-start if -1)
    // R7 = stop   (range if -1)
    //   returns:
    // R8 = position of match (or -1=none or -2=error)
    // R9 = pointer to regs
    case RegEx_Search2:
    {
      // Calculate lengths if not given
      if (R2 == -1)
        R2 = strlen((char *) R1);
      if (R4 == -1)
        R4 = strlen((char *) R3);
      if (R6 == -1)
        R6 = R2 + R4 - R5;
      if (R7 == -1)
        R7 = R6;

      // Do it
      R8 = re_search_2(
        expr, (char *) R1, R2, (char *) R3, R4, R5, R6, regs, R7);
      R9 = (int) regs;

      break;
    }

    // R1 = string
    // R2 = length (strlen if -1)
    // R3 = start
    //   returns:
    // R4 = number of matching characters
    // R5 = pointer to regs
    case RegEx_Match:
    {
      // Calculate lengths if not given
      if (R2 == -1)
        R2 = strlen((char *) R1);

      // Do it
      R4 = re_match(expr, (char *) R1, R2, R3, regs);
      R5 = (int) regs;

      break;
    }

    // R1 = string1
    // R2 = length1 (strlen if -1)
    // R3 = string2
    // R4 = length2 (strlen if -1)
    // R5 = start
    // R6 - stop
    //   returns:
    // R7 = number of matching characters
    // R8 = pointer to regs
    case RegEx_Match2:
    {
      // Calculate lengths if not given
      if (R2 == -1)
        R2 = strlen((char *) R1);
      if (R4 == -1)
        R4 = strlen((char *) R3);

      // Do it
      R7 = re_match_2(expr, (char *) R1, R2, (char *) R3, R4, R5, regs, R6);
      R8 = (int) regs;

      break;
    }

    // R1 = count
    // R2 = starts
    // R3 = ends
    case RegEx_SetRegisters:
    {
      // Maybe clear old regs
      if (!buffer->user_regs)
      {
        // They've been allocated automatically - get rid of 'em
        if (buffer->regs->start)
        {
          free(buffer->regs->start);
        }
        if (buffer->regs->end)
        {
          free(buffer->regs->end);
        }
      }
      buffer->user_regs = 1;

      // Do it
      re_set_registers(
        expr, regs, (unsigned) R1, (regoff_t *) R2, (regoff_t *) R3);

      break;
    }

    //
    case RegEx_Free:
    {
      // Do it
      free_buffer(buffer);

      // Let 'em know
      R0 = 0;

      break;
    }

    //
    case RegEx_PrintCompiledPattern:
    {
#ifdef DEBUG
      print_compiled_pattern(expr);
      break;
#else
      return &NoDebugError;
#endif
    }

    // R1 = pattern
    // R2 = syntax (previous/default if -1)
    //   returns:
    // R3 = error-code
    case RegEx_CompileExtendedPattern:
    {
      int slen = 0;

      // Use previous syntax?
      if (R2 == -1)
      {
        if (expr->allocated)
          R2 = (int) expr->syntax;
        else
          R2 = default_syntax;
      }

      // Try to convert expr
      if (!extend_pattern((char *) R1, &slen, R2 & RE_CASE_INSENSITIVE))
      {
        return error;
      }

      // Do it
      R3 = compile_pattern(new_expr, slen, (reg_syntax_t) R2, expr);
      if (R3)
        return OSERR(R3);

      break;
    }

    //   returns:
    // R0 = syntax
    case RegEx_GetDefaultSyntax:
    {
      // Let 'em know
      R0 = default_syntax;

      break;
    }

    // R0 = syntax
    case RegEx_SetDefaultSyntax:
    {
      // Do it
      default_syntax = R0;

      break;
    }

    //
    default:
    {
      return &UndefinedSWIError;
    }
  }

  // Assume all went well
  return NULL;
}

#ifdef DEBUG
_kernel_oserror *
regex_cmd(char *arg_string, int argc, int cmd_no, void *pw)
{
  UNUSED(arg_string);
  UNUSED(argc);
  UNUSED(cmd_no);
  UNUSED(pw);

  // debug stuff here ...
  printf("default_syntax = %x\n",default_syntax);

  return 0;
}

void
printchar(unsigned c)
{
  putchar(c);
}
#endif
