/tmp/build/foma/foma-0.10.0+g279~a2d32b38/lexcread.c

Bug Summary

File:	lexcread.c
Warning:	line 268, column 9 Potential leak of memory pointed to by 'sigreplace'
Annotated Source Code

Press '?' to see keyboard shortcuts
Show analyzer invocation
clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name lexcread.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -resource-dir /usr/lib/llvm-16/lib/clang/16 -D _GNU_SOURCE -I /tmp/build/foma/foma-0.10.0+g279~a2d32b38 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -std=c18 -fdebug-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -ferror-limit 19 -fvisibility=hidden -fgnuc-version=4.2.1 -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/foma/scan-build/2024-09-11-155945-2678-1 -x c /tmp/build/foma/foma-0.10.0+g279~a2d32b38/lexcread.c
1/*   Foma: a finite-state toolkit and library.                                 */
2/*   Copyright © 2008-2021 Mans Hulden                                         */

4/*   This file is part of foma.                                                */

6/*   Licensed under the Apache License, Version 2.0 (the "License");           */
7/*   you may not use this file except in compliance with the License.          */
8/*   You may obtain a copy of the License at                                   */

10/*      http://www.apache.org/licenses/LICENSE-2.0                             */

12/*   Unless required by applicable law or agreed to in writing, software       */
13/*   distributed under the License is distributed on an "AS IS" BASIS,         */
14/*   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  */
15/*   See the License for the specific language governing permissions and       */
16/*   limitations under the License.                                            */

18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include "foma.h"
22#include "lexc.h"

/* Number of buckets in the sigma symbol hash table; lexc_symbol_hash() */
/* reduces its result modulo this value and lexc_init() allocates this many slots. */
24#define SIGMA_HASH_TABLESIZE3079 3079

/* Values stored in current_entry: the pending entry is either a plain */
/* word (consumed by lexc_add_word) or a regex network (lexc_add_network). */
26#define WORD_ENTRY1 1
27#define REGEX_ENTRY2 2

/* Non-zero: two-sided entries are aligned with lexc_medpad() instead of lexc_pad(). */
29extern int g_lexc_align;
/* Non-zero: progress and warning messages are written to stderr. */
30extern int g_verbose;

/* Node of the singly linked list of declared multicharacter symbols */
/* (list head is the file-scope pointer mc). */
32struct multichar_symbols {
  char *symbol;                     /* heap copy of the symbol text (strdup in lexc_add_mc) */
  short int sigma_number;           /* number returned by sigma_add() for this symbol */
  struct multichar_symbols *next;   /* next node; lexc_add_mc inserts longer symbols first */
36};

/* One entry per lexicon name seen so far (list head: lexstates). */
38struct lexstates {             /* Separate list of LEXICON states */
  char *name;    /* heap copy of the lexicon name */
  struct states *state;   /* the state that represents this lexicon */
  struct lexstates *next;   /* next lexicon entry */
  unsigned char targeted;   /* set to 1 by lexc_merge_states when a transition leads to this lexicon */
  unsigned char has_outgoing;   /* set to 1 when the lexicon is selected as a source (which == 0) */
44};

/* A state of the lexicon graph together with its outgoing transition list. */
46struct states {
  /* Outgoing transition; transitions of one state form a singly linked list. */
  struct trans {
      short int in;            /* input-side symbol number */
      short int out;           /* output-side symbol number */
      struct states *target;   /* destination state */
      struct trans *next;      /* next transition leaving the same state */
  } *trans;
  struct lexstates *lexstate; /* ptr to lexicon state */
  int number;                 /* State number (generated later) */
  unsigned int hashval;       /* Hash for remaining symbols until next lexstate */
  unsigned char mergeable;    /* Can this state be merged with other suffix */
                              /* 0 = NO, 1 = YES, 2 = DELETED/MERGED */
  unsigned short int distance;      /* Number of remaining symbols until lexstate */
  struct states *merge_with;  /* initialised to the state itself; lexc_merge_states redirects it to the surviving state */
60};

/* Node of the global list of all states; also used as an array element */
/* type in lexc_to_fsm. */
62struct statelist {
  struct states *state;     /* the state this node refers to */
  struct statelist *next;   /* next node (new states are pushed at the head) */
  char start;               /* 1 for the start state, set in lexc_number_states */
  char final;               /* 1 for final states, set in lexc_number_states */
67};

/* Bucket / chain node of the symbol -> sigma number lookup table. */
69struct lexc_hashtable {      /* Hash for looking up symbols in sigma quickly */
  char *symbol;                  /* heap copy of the symbol; NULL marks an unused bucket head */
  struct lexc_hashtable *next;   /* next entry hashing to the same bucket */
  int sigma_number;              /* sigma number of the symbol (-1 in unused bucket heads) */
73};

/* Candidate table sizes; lexc_merge_states picks the first one that is */
/* not smaller than a quarter of the number of mergeable states. */
75static unsigned int primes[26] = {61,127,251,509,1021,2039,4093,8191,16381,32749,65521,131071,262139,524287,1048573,2097143,4194301,8388593,16777213,33554393,67108859,134217689,268435399,536870909,1073741789,2147483647};

/* Head of the list of all states (newest first). */
77static struct statelist *statelist = NULL((void*)0);
/* Head of the multicharacter symbol list. */
78static struct multichar_symbols *mc = NULL((void*)0);
/* Head of the lexicon list. */
79static struct lexstates *lexstates = NULL((void*)0);
/* Alphabet being built for the lexicon (created in lexc_init). */
80static struct sigma *lexsigma = NULL((void*)0);
/* Symbol lookup table with SIGMA_HASH_TABLESIZE3079 buckets (allocated in lexc_init). */
81static struct lexc_hashtable *hashtable;
/* Network registered by lexc_set_network for the pending regex entry. */
82static struct fsm *current_regex_network;

/* cwordin/cwordout: -1 terminated symbol numbers of the current word (input/output side). */
/* medcwordin/medcwordout: scratch arrays used by lexc_medpad. carity: 1 or 2 sides. */
84static int cwordin[1000], cwordout[1000], medcwordin[2000], medcwordout[2000], carity, lexc_statecount, maxlen, hasfinal, current_entry, net_has_unknown;
/* 256*256 flags indexed by the first two bytes of every multicharacter symbol. */
85static _Bool *mchash;
/* Current source lexicon and current target lexicon. */
86static struct lexstates *clexicon, *ctarget;

/* Forward declarations of file-local helpers. */
88static char *mystrncpy(char *dest, char *src, int len);
89static void lexc_string_to_tokens(char *string, int *intarr);
90static void lexc_pad();
91static void lexc_medpad();
92static void lexc_number_states();
93static void lexc_cleanup();
94static unsigned int lexc_suffix_hash(int offset);
95static unsigned int lexc_symbol_hash(char *s);
96static void lexc_update_unknowns(int sigma_number);

98static unsigned int lexc_suffix_hash(int offset) {
  register unsigned int h = 0, g, p;
  /* Read suffixes in cwordin[] and cwordout[] and return a hash value */
  for(p = offset; cwordin[p] != -1; p++) {
      h = (h << 4) + (unsigned int) (cwordin[p] | (cwordout[p] << 8));
      if (0 != (g = h & 0xf0000000)) {
          h = h ^ (g >> 24);
          h = h ^ g;
      }
  }
  /* No tablemod here, we decide on the table size later */
  return h;
110}

112static unsigned int lexc_symbol_hash(char *s) {
  register unsigned int hash;
  int c;
  hash = 5381;
  while ((c = *s++))
hash = ((hash << 5) + hash) + c;
  return (hash % SIGMA_HASH_TABLESIZE3079);
119}

121int lexc_find_sigma_hash(char *symbol) {
  int ptr;
  struct lexc_hashtable *h;
  ptr = lexc_symbol_hash(symbol);

  if ((hashtable+ptr)->symbol == NULL((void*)0))
      return -1;
  for (h = (hashtable+ptr); h != NULL((void*)0); h = h->next) {
      if (strcmp(symbol,h->symbol) == 0) {
          return (h->sigma_number);
      }
  }
  return -1;
134}

136void lexc_add_sigma_hash(char *symbol, int number) {
  int ptr;
  struct lexc_hashtable *h, *hnew;
  ptr = lexc_symbol_hash(symbol);

  if (net_has_unknown == 1)
      lexc_update_unknowns(number);

  if ((hashtable+ptr)->symbol == NULL((void*)0)) {
      (hashtable+ptr)->symbol = strdup(symbol);
      (hashtable+ptr)->sigma_number = number;
      return;
  }
  for (h = hashtable+ptr; h->next != NULL((void*)0); h = h->next) {
  }
  hnew = malloc(sizeof(struct lexc_hashtable));
  hnew->symbol = strdup(symbol);
  hnew->sigma_number = number;
  h->next = hnew;
  hnew->next = NULL((void*)0);
156}

158void lexc_init() {
  int i;
  lexsigma = sigma_create();
  mc = NULL((void*)0);
  lexstates = NULL((void*)0);
  clexicon = NULL((void*)0);
  ctarget = NULL((void*)0);
  statelist = NULL((void*)0);
  lexc_statecount = 0;
  net_has_unknown = 0;
  lexc_clear_current_word();
  hashtable = calloc(SIGMA_HASH_TABLESIZE3079, sizeof(struct lexc_hashtable));

  maxlen = 0;

  mchash = calloc(256*256, sizeof(_Bool));
  for (i=0; i< SIGMA_HASH_TABLESIZE3079; i++) {
      (hashtable+i)->symbol = NULL((void*)0);
      (hashtable+i)->sigma_number = -1;
      (hashtable+i)->next = NULL((void*)0);
  }
179}

181void lexc_clear_current_word() {
  cwordin[0] = cwordout[0] = 0;
  cwordin[1] = cwordout[1] = -1;
  current_entry = WORD_ENTRY1;
185}

187void lexc_add_state(struct states *s) {
  struct statelist *sl;    
  sl = malloc(sizeof(struct statelist));
  sl->state = s;
  s->number = -1;
  sl->next = statelist;
  sl->start = 0;
  sl->final = 0;
  statelist = sl;
  lexc_statecount++;
197}

199/* Go through the net built so far and add new transitions for @ */
200/* to reflect the new symbols we now have in sigma */
201/* We should really build a fast lookup ptr for finding the @ transitions */
202/* But who in their right mind is ever going to use lots of @ in a lexicon construction? */
203/* Of course this only applies to the special construct < regex > inside lexicon entries */
204/* since @ is impossible to produce otherwise */

206void lexc_update_unknowns(int sigma_number) {
  struct statelist *s;
  struct trans *t, *newtrans;
  for (s = statelist; s != NULL((void*)0); s = s->next) {
      if (s->state->mergeable == 2)
          continue;
      for (t=s->state->trans ; t!=NULL((void*)0); t= t->next) {
          if (t->in == IDENTITY2 || t->out == IDENTITY2) {
              newtrans = malloc(sizeof(struct trans));
              newtrans->in = sigma_number;
              newtrans->out = sigma_number;
              newtrans->target = t->target;
              newtrans->next = t->next;
              t->next = newtrans;
              }
      }
  }   
223}

225void lexc_add_network() {

  struct fsm *net;
  struct fsm_state *fsm;
  struct sigma *sigma;
  struct states **slist, *sourcestate, *deststate, *newstate;
  struct statelist *s;
  struct trans *newtrans;
  int i, j, *sigreplace, signumber, maxstate, *finals, unknown_symbols, first_new_sigma, *unk = NULL((void*)0);

  unknown_symbols = 0;
  first_new_sigma = 0;
  sourcestate = clexicon->state;
  deststate = ctarget->state;

  net = current_regex_network;
  fsm = net->states;

  sigreplace = calloc(sigma_max(net->sigma)+1,sizeof(int));
4
←
Memory is allocated→

  for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1; sigma = sigma->next) {
5
←
Assuming 'sigma' is equal to NULL→
      if ((signumber = lexc_find_sigma_hash(sigma->symbol)) == -1) {
          /* Add to existing lexc sigma */
          signumber = sigma_add(sigma->symbol, lexsigma);
          first_new_sigma = first_new_sigma > 0 ? first_new_sigma : signumber;
          lexc_add_sigma_hash(sigma->symbol, signumber);
          *(sigreplace+sigma->number) = signumber;
      } else {
          /* We already have it, add to conversion table */
          *(sigreplace+sigma->number) = signumber;
      }
  }

  /* Renum arcs */
  for (i=0, maxstate = 0; (fsm+i)->state_no != -1; i++) {
6
←
Assuming the condition is false→
7
←
Loop condition is false. Execution continues on line 268→
      if ((fsm+i)->in != -1)
          (fsm+i)->in = *(sigreplace+(fsm+i)->in);
      if ((fsm+i)->out != -1)
          (fsm+i)->out = *(sigreplace+(fsm+i)->out);
      maxstate = (fsm+i)->state_no > maxstate ? (fsm+i)->state_no : maxstate;
      if ((fsm+i)->in == IDENTITY2 || (fsm+i)->in == UNKNOWN1 || (fsm+i)->out == UNKNOWN1)
          unknown_symbols = 1;
  }
  if (unknown_symbols == 1) {
8
←
Potential leak of memory pointed to by 'sigreplace'
      unk = calloc(sigma_max(lexsigma)+2,sizeof(int));
      for (i=0, sigma = lexsigma; sigma != NULL((void*)0) && sigma->number != -1; sigma=sigma->next) {
          if (sigma->number > 2 && sigma_find(sigma->symbol, net->sigma) == -1) {
              *(unk+i) = sigma->number;
              i++;
          }
      }
  }

  slist = calloc(sizeof(**slist),maxstate+1);
  finals = calloc(sizeof(int),maxstate+1);

  for (i=0; i <= maxstate;i++) {
      newstate = malloc(sizeof(struct states));
      *(slist+i) = newstate;
      newstate->trans = NULL((void*)0);
      newstate->lexstate = NULL((void*)0);
      newstate->number = -1;
      newstate->hashval = -1;
      newstate->mergeable = 0;
      newstate->distance = 0;
      newstate->merge_with = newstate;
      s = malloc(sizeof(struct statelist));
      s->state = newstate;
      s->next = statelist;
      s->start = 0;
      s->final = 0;
      statelist = s;
  }
  /* Add an EPSILON transition from sourcestate to state 0 */
  newtrans = malloc(sizeof(struct trans));
  newtrans->in = EPSILON0;
  newtrans->out = EPSILON0;
  newtrans->target = *slist;
  newtrans->next = sourcestate->trans;
  sourcestate->trans = newtrans;

  for (i=0; (fsm+i)->state_no != -1; i++) {
      if ((fsm+i)->target != -1) {
          newstate = *(slist+(fsm+i)->state_no);
          newtrans = malloc(sizeof(struct trans));
          newtrans->in = (fsm+i)->in;
          newtrans->out = (fsm+i)->out;
          newtrans->target = *(slist+(fsm+i)->target);
          newtrans->next = newstate->trans;
          newstate->trans = newtrans;
          /* Add new symbols for @:@ transitions */
          /* TODO: make this work for ?: or :? trans as well */
          if (unknown_symbols == 1) {
              if ((fsm+i)->in == IDENTITY2 || (fsm+i)->out == IDENTITY2) {
                  for (j=0; *(unk+j) != 0; j++) {
                      newtrans = malloc(sizeof(struct trans));
                      newtrans->in = *(unk+j);
                      newtrans->out = *(unk+j);
                      newtrans->target = *(slist+(fsm+i)->target);
                      newtrans->next = newstate->trans;
                      newstate->trans = newtrans;
                  }
              }
          }
      }
      finals[(fsm+i)->state_no] = (fsm+i)->final_state;
  }
  /* Add an EPSILON transition from all final states to deststate */
  for (i=0; i <= maxstate; i++) {
      if (finals[i] == 1) {
          newtrans = malloc(sizeof(struct trans));
          newtrans->in = newtrans->out = EPSILON0;
          newtrans->target = deststate;
          newstate = *(slist+i);
          newtrans->next = newstate->trans;
          newstate->trans = newtrans;
      }
  }
  if (unknown_symbols == 1) {
      free(unk);
      net_has_unknown = 1;
  }
  free(slist);
  free(finals);
349}

351void lexc_set_network(struct fsm *net) {
  current_regex_network = net;
  current_entry = REGEX_ENTRY2;
  return;
355}

357void lexc_set_current_lexicon(char *name, int which) {
  /* Sets the global lexicon variable to point to a new lexicon */
  /* the variable which = 0 indicates source, which = 1 indicated target */

  struct lexstates *l;
  struct states *newstate;

  for (l = lexstates; l != NULL((void*)0); l = l->next) {
      if (strcmp(name,l->name) == 0) {
          if (which == 0) {
l->has_outgoing = 1;
              clexicon = l;
   } else {
              ctarget = l;
   }
          return;
      }
  }
  l = malloc(sizeof(struct lexstates));
  l->next = lexstates;
  l->name = strdup(name);
  l->has_outgoing = 0;
  l->targeted = 0;
  lexstates = l;
  newstate = malloc(sizeof(struct states));
  lexc_add_state(newstate);
  newstate->lexstate = l;
  newstate->trans = NULL((void*)0);
  newstate->mergeable = 0;
  newstate->merge_with = newstate;
  l->state = newstate;
  if (which == 0) {
      clexicon = l;
l->has_outgoing = 1;
  } else { 
      ctarget = l;
  }
394}

/* Return a pointer to the first unescaped `delimiter` in `name`, or NULL */
/* if there is none.  An `escape` character followed by another character */
/* hides that character from the search.                                  */
char *lexc_find_delim(char *name, char delimiter, char escape) {
    char *p = name;

    while (*p != '\0') {
        if (*p == escape && p[1] != '\0') {
            /* Skip the escape and the character it protects */
            p += 2;
            continue;
        }
        if (*p == delimiter) {
            return p;
        }
        p++;
    }
    return NULL;
}

/* Remove escape characters from `name` in place.                          */
/*   - escape + X   becomes X (X is kept literally, even if it is '0')     */
/*   - unescaped '0' becomes the byte 0xff when mode == 1 (alignment       */
/*     EPSILON marker) and is dropped for any other mode                   */
/*   - every other character is kept                                       */
/* An escape character that is the last character of the string is        */
/* dropped; scanning stops there so the terminator is never stepped over.  */
void lexc_deescape_string(char *name, char escape, int mode) {
    int i, j;

    for (i = 0, j = 0; name[i] != '\0'; i++) {
        if (name[i] == escape) {
            if (name[i+1] == '\0') {
                /* Dangling escape: nothing follows it */
                break;
            }
            i++;
            name[j] = name[i];
            j++;
        } else if (name[i] == '0') {
            if (mode == 1) {
                /* Marks alignment EPSILON */
                name[j] = (unsigned char) 0xff;
                j++;
            }
        } else {
            name[j] = name[i];
            j++;
        }
    }
    name[j] = '\0';
}

434/* Read a string and fill cwordin, cwordout arrays */
435/* with the sigma numbers of the current word, -1 terminated */

437void lexc_set_current_word(char *name) {
  char *instring, *outstring;    
  int i;

  carity = 1;
  instring = name;
  outstring = lexc_find_delim(name,':','%');
  /* printf("CWin: [%s] CWout: [%s]\n", instring, outstring); */
  if (outstring != NULL((void*)0)) {
      *outstring = '\0';
      outstring = outstring+1;
      lexc_deescape_string(outstring,'%',1);
      carity = 2;
  }
  lexc_deescape_string(instring, '%',1);
  /* printf("CWin2: [%s] CWout2: [%s]\n", instring, outstring); */
  
  lexc_string_to_tokens(instring, cwordin);

  if (carity == 2) {
      lexc_string_to_tokens(outstring, cwordout);
if (g_lexc_align)
   lexc_medpad();
else
   lexc_pad();
  } else {
      for (i=0; *(cwordin+i) != -1; i++) {
          *(cwordout+i) = *(cwordin+i);
      }
      *(cwordout+i) = -1;

  }
  current_entry = WORD_ENTRY1;
470}


/* Backtrace direction codes stored by lexc_medpad in its direction matrix: */
/* DOWN consumes an output symbol only, LEFT consumes an input symbol only, */
/* DIAG consumes one symbol from each side.                                 */
473#define LEV_DOWN0 0
474#define LEV_LEFT1 1
475#define LEV_DIAG2 2
  
477void lexc_medpad() {
  int i, j, x, y, s1len, s2len, left, down, diag, dir;
    
  if (*cwordin == -1 && *cwordout == -1) {
*cwordin = *cwordout = EPSILON0;
*(cwordin+1) = *(cwordout+1) = -1;
return;
  }
  
  for (i = 0, j = 0; cwordin[i] != -1; i++) {
  	if (cwordin[i] == EPSILON0) {
  	    continue;
  	}
  	cwordin[j] = cwordin[i];
  	j++;
  }
  cwordin[j] = -1;

  for (i = 0, j = 0; cwordout[i] != -1; i++) {
  	if (cwordout[i] == EPSILON0) {
  	    continue;
  	}
  	cwordout[j] = cwordout[i];
  	j++;
  }
  cwordout[j] = -1;
  
  for (i = 0; cwordin[i] != -1; i++) { }
  s1len = i;
  for (i = 0; cwordout[i] != -1; i++) { }
  s2len = i;
  
  int **matrix = calloc(s1len + 2, sizeof(int*));
  int** dirmatrix = calloc(s1len + 2, sizeof(int*));
  for (size_t i = 0; i < s1len + 2; ++i) {
      matrix[i] = calloc(s2len + 2, sizeof(int));
      dirmatrix[i] = calloc(s2len + 2, sizeof(int));
  }

  matrix[0][0] = 0;
  dirmatrix[0][0] = 0;
  for (x = 1; x <= s1len; x++) {
      matrix[x][0] = matrix[x-1][0] + 1;
dirmatrix[x][0] = LEV_LEFT1;
  }
  for (y = 1; y <= s2len; y++) {
      matrix[0][y] = matrix[0][y-1] + 1;
dirmatrix[0][y] = LEV_DOWN0;
  }
  for (x = 1; x <= s1len; x++) {
      for (y = 1; y <= s2len; y++) {
  	    diag = matrix[x-1][y-1] + (cwordin[x-1] == cwordout[y-1] ? 0 : 100);
  	    down =  matrix[x][y-1] + 1;
  	    left = matrix[x-1][y] + 1;
  	    if (diag <= left && diag <= down) {
  		matrix[x][y] = diag;
  		dirmatrix[x][y] = LEV_DIAG2;
  	    } else if (left <= diag && left <= down) {
  		matrix[x][y] = left;
  		dirmatrix[x][y] = LEV_LEFT1;
  	    } else {
  		matrix[x][y] = down ;
  		dirmatrix[x][y] = LEV_DOWN0;
  	    }
  	}
  }

  for (x = s1len, y = s2len, i = 0; (x > 0) || (y > 0); i++) {
dir = dirmatrix[x][y];
  	if (dir == LEV_DIAG2) {
  	    medcwordin[i] = cwordin[x-1];
  	    medcwordout[i] = cwordout[y-1];
  	    x--;
  	    y--;
  	}
  	else if (dir == LEV_DOWN0) {
  	    medcwordin[i] = EPSILON0;
  	    medcwordout[i] = cwordout[y-1];
  	    y--;
  	}
  	else {
  	    medcwordin[i] = cwordin[x-1];
   medcwordout[i] = EPSILON0;
  	    x--;
  	}
  }
  for (j = 0, i-= 1; i >= 0; j++, i--) {
  	cwordin[j] = medcwordin[i];
  	cwordout[j] = medcwordout[i];
  }
  cwordin[j] = -1;
  cwordout[j] = -1;

  for (size_t i = 0; i < s1len + 2; ++i) {
      free(matrix[i]);
      free(dirmatrix[i]);
  }
  free(matrix);
  free(dirmatrix);
576}

578void lexc_pad() {
  int i, pad;
  /* Pad the shorter of current in, out words in cwordin, cwordout with EPSILON */

  if (*cwordin == -1 && *cwordout == -1) {
*cwordin = *cwordout = EPSILON0;
*(cwordin+1) = *(cwordout+1) = -1;
return;
  }

  for (i=0, pad = 0; ;i++) {
      if (pad == 1 && *(cwordout+i) == -1) {
          *(cwordin+i) = -1;
          break;
      }
      if (pad == 2 && *(cwordin+i) == -1) {
          *(cwordout+i) = -1;
          break;
      }
      if (*(cwordin+i) == -1 && *(cwordout+i) != -1) {
          pad = 1; /* Pad upper */ 
      }
      else if (*(cwordin+i) != -1 && *(cwordout+i) == -1) {
          pad = 2; /* Pad lower */
      }
      if (pad == 1) {
          *(cwordin+i) = EPSILON0;
      }
      if (pad == 2) {
          *(cwordout+i) = EPSILON0;
      }
      if (pad == 0 && *(cwordin+i) == -1)
          break;
  }
612}

614void lexc_string_to_tokens(char *string, int *intarr) {
  int len, i, pos, skip, signumber, multi;
  unsigned int mchashval;
  char tmpstring[5];
  struct multichar_symbols *mcs;
  len = strlen(string);
  for (i=0, pos = 0; i < len; ) {

/* EPSILON for alignment is marked as 0xff */
if ((unsigned char) string[i] == 0xff) {
   *(intarr+pos) = EPSILON0;
   pos++;
   i++;
   continue;
}

      multi = 0;
      mchashval = (unsigned int) ((unsigned char) *(string+i)) * 256 + (unsigned int) ((unsigned char) *(string+i+1));
      if ((i < len-1) && *(mchash+mchashval) == 1) {
          for (mcs = mc; mcs != NULL((void*)0); mcs = mcs->next) {
              if (strncmp(string+i,mcs->symbol,strlen(mcs->symbol)) == 0) {
                  /* printf("Found multichar: [%s][%i]\n",mcs->symbol,mcs->sigma_number); */
                  multi = 1;
                  break;
              }
          }
      }

      if (multi) {
          *(intarr+pos) = mcs->sigma_number;
          pos++;
          i += strlen(mcs->symbol);
      } else {
          skip = utf8skip(string+i);
          if ((signumber = lexc_find_sigma_hash(mystrncpy(tmpstring,string+i,skip+1))) != -1) {
              *(intarr+pos) = signumber;
              pos++;
              i = i + skip + 1;
          } else {
              signumber = sigma_add(mystrncpy(tmpstring, string+i, skip+1), lexsigma);
              lexc_add_sigma_hash(tmpstring, signumber);
              *(intarr+pos) = signumber;
              pos++;
              i = i + skip + 1;
          }
      }
  }
  *(intarr+pos) = -1;
662}

/* Copy at most `len` characters from `src` to `dest`, stopping early at */
/* the end of `src`, and always NUL-terminate `dest`.  `dest` must have  */
/* room for len + 1 bytes.  Returns `dest`.                              */
char *mystrncpy(char *dest, char *src, int len) {
    int n = 0;

    while (n < len && src[n] != '\0') {
        dest[n] = src[n];
        n++;
    }
    dest[n] = '\0';
    return dest;
}

676/* Add MC to front of chain */
677/* In decreasing order of length */

679void lexc_add_mc(char *symbol) {
  int s, len;
  unsigned int mchashval;
  struct multichar_symbols *mcs, *mcprev, *mcnew;
  lexc_deescape_string(symbol,'%',0);
  if (!lexc_find_mc(symbol)) {
      len = utf8strlen(symbol);
      mcprev = NULL((void*)0);
      for (mcs = mc; mcs != NULL((void*)0) && utf8strlen(mcs->symbol) > len; mcprev = mcs, mcs=mcs->next) {
      }
      mcnew = malloc(sizeof(struct multichar_symbols));
      mcnew->symbol = strdup(symbol);
      mcnew->next = mcs;
      if ((mc == NULL((void*)0)) ||(mcs != NULL((void*)0) && mcprev == NULL((void*)0)))
          mc = mcnew;
      if (mcprev != NULL((void*)0))
          mcprev->next = mcnew;
      
      s = sigma_add(symbol, lexsigma);
      mchashval = (unsigned int) ((unsigned char) *(symbol)) * 256 + (unsigned int) ((unsigned char) *(symbol+1));    
      lexc_add_sigma_hash(symbol, s);
      *(mchash+mchashval) = 1;
      mcnew->sigma_number = s;
  }
703}

705int lexc_find_mc(char *symbol) {
  struct multichar_symbols *mcs;
  for (mcs = mc ; mcs != NULL((void*)0) ; mcs = mcs->next) {
      if (strcmp(symbol,mcs->symbol) == 0)
          return 1;
  }
  return 0;
712}

714struct states *lexc_find_lex_state(char *name) {
  struct lexstates *l;
  for (l = lexstates ; l != NULL((void*)0); l = l->next) {
      if (strcmp(name,l->name) == 0)
          return (l->state);
  }
  return NULL((void*)0);
721}

723void lexc_add_word() {
  /** Add a word from source state to destination state */
  struct trans *newtrans, *trans;
  struct states *sourcestate, *deststate, *newstate;
  int i, follow, len;

  if (current_entry == REGEX_ENTRY2) {
1
Assuming 'current_entry' is equal to REGEX_ENTRY→
2
←
Taking true branch→
      lexc_add_network();
3
←
Calling 'lexc_add_network'→
      return;
  }
          
  /* find source, dest */
  sourcestate = clexicon->state;
  deststate = ctarget->state;

  for (i=0; *(cwordin+i) != -1; i++) {}
  len = i;
  maxlen = len > maxlen ? len : maxlen;
  
  /* We follow the source state if the symbols are the same */
  /* To merge prefixes */
  for (follow = 1, i=0; *(cwordin+i) != -1; i++) {
      
      if (follow == 1) {
          for (trans = sourcestate->trans; trans != NULL((void*)0) ; trans = trans->next) {
              if (trans->in == *(cwordin+i) && trans->out == *(cwordout+i) && trans->target->lexstate == NULL((void*)0)) {
                  /* Can't follow if target needs to be lexstate */
                  if (*(cwordin+i+1) == -1 && trans->target != deststate) {
                      continue;
                  }
                  sourcestate = trans->target;
                  sourcestate->mergeable = 0;
                  /* Breakout */
                  goto breakout;
              }
          }
      }
      follow = 0;

      newtrans = malloc(sizeof(struct trans));
      if (*(cwordin+i+1) == -1) {
          newtrans->target = deststate;
      } else {
          newstate = malloc(sizeof(struct states));
          lexc_add_state(newstate);
          newtrans->target = newstate;
          newstate->trans = NULL((void*)0);
          newstate->lexstate = NULL((void*)0);
          newstate->mergeable = 1;
          newstate->hashval = lexc_suffix_hash(i+1);
          newstate->distance = len - i - 1;
          newstate->merge_with = newstate;
      }
      newtrans->next = sourcestate->trans;
      sourcestate->trans = newtrans;

      newtrans->in = *(cwordin+i);
      newtrans->out = *(cwordout+i);

      sourcestate = newtrans->target;
  breakout:;
      
  }
  return;
787}

789void lexc_number_states() {
  int n, smax, hasroot;
  struct statelist *s;
  struct lexstates *l;

  smax = n = hasfinal = 0;

  for (hasroot = 0, s = statelist; s != NULL((void*)0); s = s->next) {
      smax++;
      if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "Root") == 0) {
          s->state->number = 0;
          s->start = 1;
          n++;
          hasroot = 1;
          break;
      }
  }
  /* If there is no Root lexicon, the first lexicon mentioned is Root */
  if (!hasroot) {
      for (s = statelist; s != NULL((void*)0); s = s->next) {        
          if (s->next == NULL((void*)0)) {
              s->state->number = 0;
              if (g_verbose)
              {
                  fprintf(stderrstderr,"*Warning: no Root lexicon, using '%s' as Root.\n",s->state->lexstate->name);
                  fflush(stderrstderr);
              }
              s->start = 1;
              n++;
          }
      }
  }
  /* Mark # as the last state */
  for (s = statelist; s != NULL((void*)0); s = s->next) {
      if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "#") == 0) {
          s->state->number = smax-1;
          s->final = 1;
          hasfinal = 1;
      } else if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "#") != 0 && s->state->lexstate->has_outgoing == 0) {
   /* Also mark uncontinued states as final (this is warned about elsewhere) */
          s->final = 1;
}
  }

  for (s = statelist; s != NULL((void*)0); s = s->next) { 
      if (s->state->number == -1) {
          s->state->number = n;
          n++;
      }
  }
  lexc_statecount = n+1;
  for (l = lexstates; l != NULL((void*)0) ; l = l->next) {
      if (l->targeted == 0 && l->state->number != 0) {
          if (g_verbose)
          {
              fprintf(stderrstderr,"*Warning: lexicon '%s' defined but not used\n",l->name);
              fflush(stderrstderr);
          }
      }
      if (l->has_outgoing == 0 && strcmp(l->name, "#") != 0) {
          if (g_verbose)
          {
              fprintf(stderrstderr,"***Warning: lexicon '%s' used but never defined\n",l->name);
              fflush(stderrstderr);
          }
      }
  }
856}

858int lexc_eq_paths(struct states *one, struct states *two) {
  while (one->lexstate == NULL((void*)0) && two->lexstate == NULL((void*)0)) {
      if (one->trans->in != two->trans->in || one->trans->out != two->trans->out)
          return 0;
      one = one->trans->target;
      two = two->trans->target;
  }
  if (one->lexstate != two->lexstate)
      return 0;
  return 1;
868}

870void lexc_merge_states() {
  struct lenlist {
      struct states *state;
      struct lenlist *next;
  };
  struct hashstates {
      struct states *state;
      struct hashstates *next;
  } *hashstates, *currenth, *newh;

  struct lenlist *lenlist, *newl, *currentl;
  struct statelist *s, *sprev, *sf;
  struct states *state, *purgestate;
  struct trans *t, *tprev;
  int i, numstates, tablesize, hash;

  /* Create array of ptrs to states depending on string length */
  lenlist = calloc(maxlen+1,sizeof(struct lenlist));
  numstates = 0;
  for (s = statelist ; s!= NULL((void*)0); s = s->next) {
      if (s->state->mergeable)
          numstates++;
  }

  /* Find a suitable prime for hashing: proportional to the size of the */
  /* number of mergeable states */

  for (i = 0; primes[i] < numstates/4; i++) { }    
  tablesize = primes[i];
  hashstates = calloc(tablesize,sizeof(struct hashstates));

  for (s = statelist ; s!= NULL((void*)0); s = s->next) {
      if (s->state->mergeable) {
          numstates++;
          currentl = lenlist+(s->state->distance);
          if (currentl->state == NULL((void*)0))
              currentl->state = s->state;
          else {
              newl = calloc(1,sizeof(struct lenlist));
              newl->state = s->state;
              newl->next = currentl->next;
              currentl->next = newl;
          }           
          s->state->hashval = s->state->hashval % tablesize;
          currenth = hashstates+s->state->hashval;
          if (currenth->state == NULL((void*)0)) {
              currenth->state = s->state;
          } else {
              newh = calloc(1,sizeof(struct hashstates));
              newh->state = s->state;
              newh->next = currenth->next;
              currenth->next = newh; 
          }
      }
  }
  
  for (i = maxlen; i >= 1 ; i--) {
      /* printf("Analyzing: [%i]...",i); fflush(stdout); */
      for (currentl = (lenlist+i); currentl != NULL((void*)0); currentl = currentl->next) {
          if (currentl->state == NULL((void*)0))
              break;
          if (currentl->state->mergeable != 1)
              continue;
          /* Find states hashing to same value as current */
          state = currentl->state;
          hash = state->hashval;
          for (currenth = hashstates+hash; currenth != NULL((void*)0); currenth = currenth->next) {
              /* Merge */
              if (currenth->state != state && currenth->state->mergeable == 1 && currenth->state->distance == state->distance && lexc_eq_paths(currenth->state,state)) {
                  currenth->state->merge_with = state;
                  for (purgestate = currenth->state; purgestate->lexstate == NULL((void*)0); purgestate = purgestate->trans->target) {
                      purgestate->mergeable = 2;
                  }
              }
          }
      }
  }

  /* Go through statelist and remove merged states and free states, trans */
  
  for (s = statelist, sprev = NULL((void*)0); s != NULL((void*)0); s = s->next) {
      for (t = s->state->trans, tprev = NULL((void*)0); t != NULL((void*)0); tprev = t, t = t->next) {
          t->target = t->target->merge_with;
          if (tprev != NULL((void*)0) && s->state->mergeable == 2) {
              free(tprev);
          } else {
              if (t->target->lexstate != NULL((void*)0))
                  t->target->lexstate->targeted = 1;
          }
      }
      if (tprev != NULL((void*)0) && s->state->mergeable == 2)
          free(tprev);
  }
  for (s = statelist, sprev = NULL((void*)0); s != NULL((void*)0); ) {
      if (s->state->mergeable == 2) {
          if (sprev != NULL((void*)0)) {
              sprev->next = s->next;                
          } else {
              statelist = s;
          }
          free(s->state);
          sf = s;
          s = s->next;
          free(sf);
      } else {
          sprev = s;
          s = s ->next;
      }
  }

  /* Cleanup */

  for (i = 0; i < maxlen ; i++) {
      newl = NULL((void*)0);
      for (currentl = (lenlist+i)->next; currentl != NULL((void*)0) ;currentl=currentl->next) {
          if (newl != NULL((void*)0))
              free(newl);
          newl = currentl;
      }
      if (newl != NULL((void*)0))
          free(newl);
  }
  for (i = 0; i < tablesize ; i++) {
      newh = NULL((void*)0);
      for (currenth = (hashstates+i)->next; currenth != NULL((void*)0) ;currenth=currenth->next) {
          if (newh != NULL((void*)0))
              free(newh);
          newh = currenth;
      }
      if (newh != NULL((void*)0))
          free(newh);
  }
  free(hashstates);
  free(lenlist);
1004}

1006struct fsm *lexc_to_fsm() {
  struct statelist *s, *sa;
  struct fsm_state *fsm;
  struct fsm *net;
  struct trans *t;
  int i, j,  linecount;

  if (g_verbose)
  {
      fprintf(stderrstderr,"Building lexicon...\n");
      fflush(stderrstderr);
  }
  lexc_merge_states();
  net = fsm_create("");
  free(net->sigma);
  net->sigma = lexsigma;
  lexc_number_states();
  if (hasfinal == 0) {
      if (g_verbose)
      {
          fprintf(stderrstderr,"Warning: # is never reached!!!\n");
          fflush(stderrstderr);
      }
      return(fsm_empty_set());
  }
  sa = malloc(sizeof(struct statelist)*lexc_statecount);
  for (s = statelist; s != NULL((void*)0); s = s->next) {
      sa[s->state->number].state = s->state;
      sa[s->state->number].start = s->start;
      sa[s->state->number].final = s->final;
  }
  linecount = 0;
  for (s = statelist; s != NULL((void*)0); s = s->next) {
      linecount++;
      for (t = s->state->trans; t != NULL((void*)0); t = t->next)
          linecount++;
  }
  fsm = malloc(sizeof(struct fsm_state)*(linecount+1));
  for (i = 0, j = 0, s = sa; j < lexc_statecount; j++) {
      if (s[j].state->trans == NULL((void*)0)) {
          add_fsm_arc(fsm,i,s[j].state->number, -1, -1, -1, s[j].final, s[j].start);
          i++;
      } else {
          for (t = s[j].state->trans; t != NULL((void*)0); t = t->next) {
              add_fsm_arc(fsm,i,s[j].state->number,t->in,t->out,t->target->number,s[j].final,s[j].start);
              i++;
          }
      }
  }
  add_fsm_arc(fsm, i, -1, -1, -1, -1, -1, -1);
  net->states = fsm;
  net->statecount = lexc_statecount;
  fsm_update_flags(net, UNK2, UNK2, UNK2, UNK2, UNK2, UNK2);
  if (sigma_find_number(EPSILON0, lexsigma) == -1)
      sigma_add_special(EPSILON0, lexsigma);
  free(s);
  lexc_cleanup();
  sigma_cleanup(net,0);
  sigma_sort(net);
  
  if (g_verbose)
  {
      fprintf(stderrstderr,"Determinizing...\n");
      fflush(stderrstderr);
  }
  net = fsm_determinize(net);
  if (g_verbose)
  {
      fprintf(stderrstderr,"Minimizing...\n");
      fflush(stderrstderr);
  }
  net = fsm_topsort(fsm_minimize(net));
  if (g_verbose)
  {
      fprintf(stderrstderr,"Done!\n");
      fflush(stderrstderr);
  }
  return(net);
1084}

1086void lexc_cleanup() {
  struct lexstates *l, *ln;
  struct statelist *s, *sn;
  struct trans *t, *tn;
  struct multichar_symbols *mcs, *mcsn;
  struct lexc_hashtable *lhash, *lprev;
  int i;
  free(mchash);
  for (i=0; i < SIGMA_HASH_TABLESIZE3079; i++) {
      for (lhash = hashtable+i; lhash != NULL((void*)0); ) {
          if (lhash->symbol != NULL((void*)0)) {
              free(lhash->symbol);
          }
          lprev = lhash;
          lhash = lhash->next;
          if (lprev != hashtable+i) { free(lprev); }
      }
  }
  free(hashtable);
  for (mcs = mc ; mcs != NULL((void*)0) ; mcs = mcsn) {
      mcsn = mcs->next;
free(mcs->symbol);
      free(mcs);
  }
  for (l = lexstates ; l != NULL((void*)0) ; l = ln) {
      ln = l->next;
      free(l->name);
      free(l);
  }
  for (s = statelist; s != NULL((void*)0); s = s->next) {
      for (t = s->state->trans; t != NULL((void*)0); t = tn) {
          tn = t->next;
          free(t);
      }
      free(s->state);
  }
  for (s = statelist; s != NULL((void*)0); s = sn) {
      sn = s->next;
      free(s);
  }
1126}