Bug Summary

File:lexcread.c
Warning:line 278, column 13
Result of 'calloc' is converted to a pointer of type 'struct states *', which is incompatible with sizeof operand type 'struct states'

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name lexcread.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -resource-dir /usr/lib/llvm-16/lib/clang/16 -D _GNU_SOURCE -I /tmp/build/foma/foma-0.10.0+g279~a2d32b38 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -std=c18 -fdebug-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -ferror-limit 19 -fvisibility=hidden -fgnuc-version=4.2.1 -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/foma/scan-build/2024-09-11-155945-2678-1 -x c /tmp/build/foma/foma-0.10.0+g279~a2d32b38/lexcread.c
1/* Foma: a finite-state toolkit and library. */
2/* Copyright © 2008-2021 Mans Hulden */
3
4/* This file is part of foma. */
5
6/* Licensed under the Apache License, Version 2.0 (the "License"); */
7/* you may not use this file except in compliance with the License. */
8/* You may obtain a copy of the License at */
9
10/* http://www.apache.org/licenses/LICENSE-2.0 */
11
12/* Unless required by applicable law or agreed to in writing, software */
13/* distributed under the License is distributed on an "AS IS" BASIS, */
14/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */
15/* See the License for the specific language governing permissions and */
16/* limitations under the License. */
17
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#include "foma.h"
22#include "lexc.h"
23
24#define SIGMA_HASH_TABLESIZE3079 3079
25
26#define WORD_ENTRY1 1
27#define REGEX_ENTRY2 2
28
29extern int g_lexc_align;
30extern int g_verbose;
31
32struct multichar_symbols {
33 char *symbol;
34 short int sigma_number;
35 struct multichar_symbols *next;
36};
37
38struct lexstates { /* Separate list of LEXICON states */
39 char *name;
40 struct states *state;
41 struct lexstates *next;
42 unsigned char targeted;
43 unsigned char has_outgoing;
44};
45
46struct states {
47 struct trans {
48 short int in;
49 short int out;
50 struct states *target;
51 struct trans *next;
52 } *trans;
53 struct lexstates *lexstate; /* ptr to lexicon state */
54 int number; /* State number (generated later) */
55 unsigned int hashval; /* Hash for remaining symbols until next lexstate */
56 unsigned char mergeable; /* Can this state be merged with other suffix */
57 /* 0 = NO, 1 = YES, 2 = DELETED/MERGED */
58 unsigned short int distance; /* Number of remaining symbols until lexstate */
59 struct states *merge_with;
60};
61
62struct statelist {
63 struct states *state;
64 struct statelist *next;
65 char start;
66 char final;
67};
68
69struct lexc_hashtable { /* Hash for looking up symbols in sigma quickly */
70 char *symbol;
71 struct lexc_hashtable *next;
72 int sigma_number;
73};
74
75static unsigned int primes[26] = {61,127,251,509,1021,2039,4093,8191,16381,32749,65521,131071,262139,524287,1048573,2097143,4194301,8388593,16777213,33554393,67108859,134217689,268435399,536870909,1073741789,2147483647};
76
77static struct statelist *statelist = NULL((void*)0);
78static struct multichar_symbols *mc = NULL((void*)0);
79static struct lexstates *lexstates = NULL((void*)0);
80static struct sigma *lexsigma = NULL((void*)0);
81static struct lexc_hashtable *hashtable;
82static struct fsm *current_regex_network;
83
84static int cwordin[1000], cwordout[1000], medcwordin[2000], medcwordout[2000], carity, lexc_statecount, maxlen, hasfinal, current_entry, net_has_unknown;
85static _Bool *mchash;
86static struct lexstates *clexicon, *ctarget;
87
88static char *mystrncpy(char *dest, char *src, int len);
89static void lexc_string_to_tokens(char *string, int *intarr);
90static void lexc_pad();
91static void lexc_medpad();
92static void lexc_number_states();
93static void lexc_cleanup();
94static unsigned int lexc_suffix_hash(int offset);
95static unsigned int lexc_symbol_hash(char *s);
96static void lexc_update_unknowns(int sigma_number);
97
98static unsigned int lexc_suffix_hash(int offset) {
99 register unsigned int h = 0, g, p;
100 /* Read suffixes in cwordin[] and cwordout[] and return a hash value */
101 for(p = offset; cwordin[p] != -1; p++) {
102 h = (h << 4) + (unsigned int) (cwordin[p] | (cwordout[p] << 8));
103 if (0 != (g = h & 0xf0000000)) {
104 h = h ^ (g >> 24);
105 h = h ^ g;
106 }
107 }
108 /* No tablemod here, we decide on the table size later */
109 return h;
110}
111
112static unsigned int lexc_symbol_hash(char *s) {
113 register unsigned int hash;
114 int c;
115 hash = 5381;
116 while ((c = *s++))
117 hash = ((hash << 5) + hash) + c;
118 return (hash % SIGMA_HASH_TABLESIZE3079);
119}
120
121int lexc_find_sigma_hash(char *symbol) {
122 int ptr;
123 struct lexc_hashtable *h;
124 ptr = lexc_symbol_hash(symbol);
125
126 if ((hashtable+ptr)->symbol == NULL((void*)0))
127 return -1;
128 for (h = (hashtable+ptr); h != NULL((void*)0); h = h->next) {
129 if (strcmp(symbol,h->symbol) == 0) {
130 return (h->sigma_number);
131 }
132 }
133 return -1;
134}
135
136void lexc_add_sigma_hash(char *symbol, int number) {
137 int ptr;
138 struct lexc_hashtable *h, *hnew;
139 ptr = lexc_symbol_hash(symbol);
140
141 if (net_has_unknown == 1)
142 lexc_update_unknowns(number);
143
144 if ((hashtable+ptr)->symbol == NULL((void*)0)) {
145 (hashtable+ptr)->symbol = strdup(symbol);
146 (hashtable+ptr)->sigma_number = number;
147 return;
148 }
149 for (h = hashtable+ptr; h->next != NULL((void*)0); h = h->next) {
150 }
151 hnew = malloc(sizeof(struct lexc_hashtable));
152 hnew->symbol = strdup(symbol);
153 hnew->sigma_number = number;
154 h->next = hnew;
155 hnew->next = NULL((void*)0);
156}
157
158void lexc_init() {
159 int i;
160 lexsigma = sigma_create();
161 mc = NULL((void*)0);
162 lexstates = NULL((void*)0);
163 clexicon = NULL((void*)0);
164 ctarget = NULL((void*)0);
165 statelist = NULL((void*)0);
166 lexc_statecount = 0;
167 net_has_unknown = 0;
168 lexc_clear_current_word();
169 hashtable = calloc(SIGMA_HASH_TABLESIZE3079, sizeof(struct lexc_hashtable));
170
171 maxlen = 0;
172
173 mchash = calloc(256*256, sizeof(_Bool));
174 for (i=0; i< SIGMA_HASH_TABLESIZE3079; i++) {
175 (hashtable+i)->symbol = NULL((void*)0);
176 (hashtable+i)->sigma_number = -1;
177 (hashtable+i)->next = NULL((void*)0);
178 }
179}
180
181void lexc_clear_current_word() {
182 cwordin[0] = cwordout[0] = 0;
183 cwordin[1] = cwordout[1] = -1;
184 current_entry = WORD_ENTRY1;
185}
186
187void lexc_add_state(struct states *s) {
188 struct statelist *sl;
189 sl = malloc(sizeof(struct statelist));
190 sl->state = s;
191 s->number = -1;
192 sl->next = statelist;
193 sl->start = 0;
194 sl->final = 0;
195 statelist = sl;
196 lexc_statecount++;
197}
198
199/* Go through the net built so far and add new transitions for @ */
200/* to reflect the new symbols we now have in sigma */
201/* We should really build a fast lookup ptr for finding the @ transitions */
202/* But who in their right mind is ever going to use lots of @ in a lexicon construction? */
203/* Of course this only applies to the special construct < regex > inside lexicon entries */
204/* since @ is impossible to produce otherwise */
205
206void lexc_update_unknowns(int sigma_number) {
207 struct statelist *s;
208 struct trans *t, *newtrans;
209 for (s = statelist; s != NULL((void*)0); s = s->next) {
210 if (s->state->mergeable == 2)
211 continue;
212 for (t=s->state->trans ; t!=NULL((void*)0); t= t->next) {
213 if (t->in == IDENTITY2 || t->out == IDENTITY2) {
214 newtrans = malloc(sizeof(struct trans));
215 newtrans->in = sigma_number;
216 newtrans->out = sigma_number;
217 newtrans->target = t->target;
218 newtrans->next = t->next;
219 t->next = newtrans;
220 }
221 }
222 }
223}
224
225void lexc_add_network() {
226
227 struct fsm *net;
228 struct fsm_state *fsm;
229 struct sigma *sigma;
230 struct states **slist, *sourcestate, *deststate, *newstate;
231 struct statelist *s;
232 struct trans *newtrans;
233 int i, j, *sigreplace, signumber, maxstate, *finals, unknown_symbols, first_new_sigma, *unk = NULL((void*)0);
234
235 unknown_symbols = 0;
236 first_new_sigma = 0;
237 sourcestate = clexicon->state;
238 deststate = ctarget->state;
239
240 net = current_regex_network;
241 fsm = net->states;
242
243 sigreplace = calloc(sigma_max(net->sigma)+1,sizeof(int));
244
245 for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1; sigma = sigma->next) {
246 if ((signumber = lexc_find_sigma_hash(sigma->symbol)) == -1) {
247 /* Add to existing lexc sigma */
248 signumber = sigma_add(sigma->symbol, lexsigma);
249 first_new_sigma = first_new_sigma > 0 ? first_new_sigma : signumber;
250 lexc_add_sigma_hash(sigma->symbol, signumber);
251 *(sigreplace+sigma->number) = signumber;
252 } else {
253 /* We already have it, add to conversion table */
254 *(sigreplace+sigma->number) = signumber;
255 }
256 }
257
258 /* Renum arcs */
259 for (i=0, maxstate = 0; (fsm+i)->state_no != -1; i++) {
260 if ((fsm+i)->in != -1)
261 (fsm+i)->in = *(sigreplace+(fsm+i)->in);
262 if ((fsm+i)->out != -1)
263 (fsm+i)->out = *(sigreplace+(fsm+i)->out);
264 maxstate = (fsm+i)->state_no > maxstate ? (fsm+i)->state_no : maxstate;
265 if ((fsm+i)->in == IDENTITY2 || (fsm+i)->in == UNKNOWN1 || (fsm+i)->out == UNKNOWN1)
266 unknown_symbols = 1;
267 }
268 if (unknown_symbols == 1) {
269 unk = calloc(sigma_max(lexsigma)+2,sizeof(int));
270 for (i=0, sigma = lexsigma; sigma != NULL((void*)0) && sigma->number != -1; sigma=sigma->next) {
271 if (sigma->number > 2 && sigma_find(sigma->symbol, net->sigma) == -1) {
272 *(unk+i) = sigma->number;
273 i++;
274 }
275 }
276 }
277
278 slist = calloc(sizeof(**slist),maxstate+1);
Result of 'calloc' is converted to a pointer of type 'struct states *', which is incompatible with sizeof operand type 'struct states'
279 finals = calloc(sizeof(int),maxstate+1);
280
281 for (i=0; i <= maxstate;i++) {
282 newstate = malloc(sizeof(struct states));
283 *(slist+i) = newstate;
284 newstate->trans = NULL((void*)0);
285 newstate->lexstate = NULL((void*)0);
286 newstate->number = -1;
287 newstate->hashval = -1;
288 newstate->mergeable = 0;
289 newstate->distance = 0;
290 newstate->merge_with = newstate;
291 s = malloc(sizeof(struct statelist));
292 s->state = newstate;
293 s->next = statelist;
294 s->start = 0;
295 s->final = 0;
296 statelist = s;
297 }
298 /* Add an EPSILON transition from sourcestate to state 0 */
299 newtrans = malloc(sizeof(struct trans));
300 newtrans->in = EPSILON0;
301 newtrans->out = EPSILON0;
302 newtrans->target = *slist;
303 newtrans->next = sourcestate->trans;
304 sourcestate->trans = newtrans;
305
306 for (i=0; (fsm+i)->state_no != -1; i++) {
307 if ((fsm+i)->target != -1) {
308 newstate = *(slist+(fsm+i)->state_no);
309 newtrans = malloc(sizeof(struct trans));
310 newtrans->in = (fsm+i)->in;
311 newtrans->out = (fsm+i)->out;
312 newtrans->target = *(slist+(fsm+i)->target);
313 newtrans->next = newstate->trans;
314 newstate->trans = newtrans;
315 /* Add new symbols for @:@ transitions */
316 /* TODO: make this work for ?: or :? trans as well */
317 if (unknown_symbols == 1) {
318 if ((fsm+i)->in == IDENTITY2 || (fsm+i)->out == IDENTITY2) {
319 for (j=0; *(unk+j) != 0; j++) {
320 newtrans = malloc(sizeof(struct trans));
321 newtrans->in = *(unk+j);
322 newtrans->out = *(unk+j);
323 newtrans->target = *(slist+(fsm+i)->target);
324 newtrans->next = newstate->trans;
325 newstate->trans = newtrans;
326 }
327 }
328 }
329 }
330 finals[(fsm+i)->state_no] = (fsm+i)->final_state;
331 }
332 /* Add an EPSILON transition from all final states to deststate */
333 for (i=0; i <= maxstate; i++) {
334 if (finals[i] == 1) {
335 newtrans = malloc(sizeof(struct trans));
336 newtrans->in = newtrans->out = EPSILON0;
337 newtrans->target = deststate;
338 newstate = *(slist+i);
339 newtrans->next = newstate->trans;
340 newstate->trans = newtrans;
341 }
342 }
343 if (unknown_symbols == 1) {
344 free(unk);
345 net_has_unknown = 1;
346 }
347 free(slist);
348 free(finals);
349}
350
351void lexc_set_network(struct fsm *net) {
352 current_regex_network = net;
353 current_entry = REGEX_ENTRY2;
354 return;
355}
356
357void lexc_set_current_lexicon(char *name, int which) {
358 /* Sets the global lexicon variable to point to a new lexicon */
359 /* the variable which = 0 indicates source, which = 1 indicated target */
360
361 struct lexstates *l;
362 struct states *newstate;
363
364 for (l = lexstates; l != NULL((void*)0); l = l->next) {
365 if (strcmp(name,l->name) == 0) {
366 if (which == 0) {
367 l->has_outgoing = 1;
368 clexicon = l;
369 } else {
370 ctarget = l;
371 }
372 return;
373 }
374 }
375 l = malloc(sizeof(struct lexstates));
376 l->next = lexstates;
377 l->name = strdup(name);
378 l->has_outgoing = 0;
379 l->targeted = 0;
380 lexstates = l;
381 newstate = malloc(sizeof(struct states));
382 lexc_add_state(newstate);
383 newstate->lexstate = l;
384 newstate->trans = NULL((void*)0);
385 newstate->mergeable = 0;
386 newstate->merge_with = newstate;
387 l->state = newstate;
388 if (which == 0) {
389 clexicon = l;
390 l->has_outgoing = 1;
391 } else {
392 ctarget = l;
393 }
394}
395
396char *lexc_find_delim(char *name, char delimiter, char escape) {
397 int i;
398 for (i=0; *(name+i) != '\0'; i++) {
399 if (*(name+i) == escape && *(name+i+1) != '\0') {
400 i++;
401 continue;
402 }
403 if (*(name+i) == delimiter) {
404 return name+i;
405 }
406 }
407 return NULL((void*)0);
408}
409
410void lexc_deescape_string(char *name, char escape, int mode) {
411 int i, j;
412 for (i=0, j=0; *(name+i) != '\0'; i++) {
413 *(name+j) = *(name+i);
414 if (*(name+i) == escape) {
415 *(name+j) = *(name+i+1);
416 j++;
417 i++;
418 continue;
419 }
420 else if (mode == 1 && *(name+i) == '0') {
421 /* Marks alignment EPSILON */
422 *(name+j) = (unsigned char) 0xff;
423 j++;
424 continue;
425 }
426 else if (*(name+i) != escape && *(name+i) != '0') {
427 j++;
428 continue;
429 }
430 }
431 *(name+j) = '\0';
432}
433
434/* Read a string and fill cwordin, cwordout arrays */
435/* with the sigma numbers of the current word, -1 terminated */
436
437void lexc_set_current_word(char *name) {
438 char *instring, *outstring;
439 int i;
440
441 carity = 1;
442 instring = name;
443 outstring = lexc_find_delim(name,':','%');
444 /* printf("CWin: [%s] CWout: [%s]\n", instring, outstring); */
445 if (outstring != NULL((void*)0)) {
446 *outstring = '\0';
447 outstring = outstring+1;
448 lexc_deescape_string(outstring,'%',1);
449 carity = 2;
450 }
451 lexc_deescape_string(instring, '%',1);
452 /* printf("CWin2: [%s] CWout2: [%s]\n", instring, outstring); */
453
454 lexc_string_to_tokens(instring, cwordin);
455
456 if (carity == 2) {
457 lexc_string_to_tokens(outstring, cwordout);
458 if (g_lexc_align)
459 lexc_medpad();
460 else
461 lexc_pad();
462 } else {
463 for (i=0; *(cwordin+i) != -1; i++) {
464 *(cwordout+i) = *(cwordin+i);
465 }
466 *(cwordout+i) = -1;
467
468 }
469 current_entry = WORD_ENTRY1;
470}
471
472
473#define LEV_DOWN0 0
474#define LEV_LEFT1 1
475#define LEV_DIAG2 2
476
477void lexc_medpad() {
478 int i, j, x, y, s1len, s2len, left, down, diag, dir;
479
480 if (*cwordin == -1 && *cwordout == -1) {
481 *cwordin = *cwordout = EPSILON0;
482 *(cwordin+1) = *(cwordout+1) = -1;
483 return;
484 }
485
486 for (i = 0, j = 0; cwordin[i] != -1; i++) {
487 if (cwordin[i] == EPSILON0) {
488 continue;
489 }
490 cwordin[j] = cwordin[i];
491 j++;
492 }
493 cwordin[j] = -1;
494
495 for (i = 0, j = 0; cwordout[i] != -1; i++) {
496 if (cwordout[i] == EPSILON0) {
497 continue;
498 }
499 cwordout[j] = cwordout[i];
500 j++;
501 }
502 cwordout[j] = -1;
503
504 for (i = 0; cwordin[i] != -1; i++) { }
505 s1len = i;
506 for (i = 0; cwordout[i] != -1; i++) { }
507 s2len = i;
508
509 int **matrix = calloc(s1len + 2, sizeof(int*));
510 int** dirmatrix = calloc(s1len + 2, sizeof(int*));
511 for (size_t i = 0; i < s1len + 2; ++i) {
512 matrix[i] = calloc(s2len + 2, sizeof(int));
513 dirmatrix[i] = calloc(s2len + 2, sizeof(int));
514 }
515
516 matrix[0][0] = 0;
517 dirmatrix[0][0] = 0;
518 for (x = 1; x <= s1len; x++) {
519 matrix[x][0] = matrix[x-1][0] + 1;
520 dirmatrix[x][0] = LEV_LEFT1;
521 }
522 for (y = 1; y <= s2len; y++) {
523 matrix[0][y] = matrix[0][y-1] + 1;
524 dirmatrix[0][y] = LEV_DOWN0;
525 }
526 for (x = 1; x <= s1len; x++) {
527 for (y = 1; y <= s2len; y++) {
528 diag = matrix[x-1][y-1] + (cwordin[x-1] == cwordout[y-1] ? 0 : 100);
529 down = matrix[x][y-1] + 1;
530 left = matrix[x-1][y] + 1;
531 if (diag <= left && diag <= down) {
532 matrix[x][y] = diag;
533 dirmatrix[x][y] = LEV_DIAG2;
534 } else if (left <= diag && left <= down) {
535 matrix[x][y] = left;
536 dirmatrix[x][y] = LEV_LEFT1;
537 } else {
538 matrix[x][y] = down ;
539 dirmatrix[x][y] = LEV_DOWN0;
540 }
541 }
542 }
543
544 for (x = s1len, y = s2len, i = 0; (x > 0) || (y > 0); i++) {
545 dir = dirmatrix[x][y];
546 if (dir == LEV_DIAG2) {
547 medcwordin[i] = cwordin[x-1];
548 medcwordout[i] = cwordout[y-1];
549 x--;
550 y--;
551 }
552 else if (dir == LEV_DOWN0) {
553 medcwordin[i] = EPSILON0;
554 medcwordout[i] = cwordout[y-1];
555 y--;
556 }
557 else {
558 medcwordin[i] = cwordin[x-1];
559 medcwordout[i] = EPSILON0;
560 x--;
561 }
562 }
563 for (j = 0, i-= 1; i >= 0; j++, i--) {
564 cwordin[j] = medcwordin[i];
565 cwordout[j] = medcwordout[i];
566 }
567 cwordin[j] = -1;
568 cwordout[j] = -1;
569
570 for (size_t i = 0; i < s1len + 2; ++i) {
571 free(matrix[i]);
572 free(dirmatrix[i]);
573 }
574 free(matrix);
575 free(dirmatrix);
576}
577
578void lexc_pad() {
579 int i, pad;
580 /* Pad the shorter of current in, out words in cwordin, cwordout with EPSILON */
581
582 if (*cwordin == -1 && *cwordout == -1) {
583 *cwordin = *cwordout = EPSILON0;
584 *(cwordin+1) = *(cwordout+1) = -1;
585 return;
586 }
587
588 for (i=0, pad = 0; ;i++) {
589 if (pad == 1 && *(cwordout+i) == -1) {
590 *(cwordin+i) = -1;
591 break;
592 }
593 if (pad == 2 && *(cwordin+i) == -1) {
594 *(cwordout+i) = -1;
595 break;
596 }
597 if (*(cwordin+i) == -1 && *(cwordout+i) != -1) {
598 pad = 1; /* Pad upper */
599 }
600 else if (*(cwordin+i) != -1 && *(cwordout+i) == -1) {
601 pad = 2; /* Pad lower */
602 }
603 if (pad == 1) {
604 *(cwordin+i) = EPSILON0;
605 }
606 if (pad == 2) {
607 *(cwordout+i) = EPSILON0;
608 }
609 if (pad == 0 && *(cwordin+i) == -1)
610 break;
611 }
612}
613
614void lexc_string_to_tokens(char *string, int *intarr) {
615 int len, i, pos, skip, signumber, multi;
616 unsigned int mchashval;
617 char tmpstring[5];
618 struct multichar_symbols *mcs;
619 len = strlen(string);
620 for (i=0, pos = 0; i < len; ) {
621
622 /* EPSILON for alignment is marked as 0xff */
623 if ((unsigned char) string[i] == 0xff) {
624 *(intarr+pos) = EPSILON0;
625 pos++;
626 i++;
627 continue;
628 }
629
630 multi = 0;
631 mchashval = (unsigned int) ((unsigned char) *(string+i)) * 256 + (unsigned int) ((unsigned char) *(string+i+1));
632 if ((i < len-1) && *(mchash+mchashval) == 1) {
633 for (mcs = mc; mcs != NULL((void*)0); mcs = mcs->next) {
634 if (strncmp(string+i,mcs->symbol,strlen(mcs->symbol)) == 0) {
635 /* printf("Found multichar: [%s][%i]\n",mcs->symbol,mcs->sigma_number); */
636 multi = 1;
637 break;
638 }
639 }
640 }
641
642 if (multi) {
643 *(intarr+pos) = mcs->sigma_number;
644 pos++;
645 i += strlen(mcs->symbol);
646 } else {
647 skip = utf8skip(string+i);
648 if ((signumber = lexc_find_sigma_hash(mystrncpy(tmpstring,string+i,skip+1))) != -1) {
649 *(intarr+pos) = signumber;
650 pos++;
651 i = i + skip + 1;
652 } else {
653 signumber = sigma_add(mystrncpy(tmpstring, string+i, skip+1), lexsigma);
654 lexc_add_sigma_hash(tmpstring, signumber);
655 *(intarr+pos) = signumber;
656 pos++;
657 i = i + skip + 1;
658 }
659 }
660 }
661 *(intarr+pos) = -1;
662}
663
664char *mystrncpy(char *dest, char *src, int len) {
665 int i;
666 for (i=0; i < len; i++) {
667 *(dest+i) = *(src+i);
668 if (*(src+i) == '\0')
669 return(dest);
670 }
671 *(dest+i) = '\0';
672/* printf("Mystrncpy: [%s]\n",dest); */
673 return(dest);
674}
675
676/* Add MC to front of chain */
677/* In decreasing order of length */
678
679void lexc_add_mc(char *symbol) {
680 int s, len;
681 unsigned int mchashval;
682 struct multichar_symbols *mcs, *mcprev, *mcnew;
683 lexc_deescape_string(symbol,'%',0);
684 if (!lexc_find_mc(symbol)) {
685 len = utf8strlen(symbol);
686 mcprev = NULL((void*)0);
687 for (mcs = mc; mcs != NULL((void*)0) && utf8strlen(mcs->symbol) > len; mcprev = mcs, mcs=mcs->next) {
688 }
689 mcnew = malloc(sizeof(struct multichar_symbols));
690 mcnew->symbol = strdup(symbol);
691 mcnew->next = mcs;
692 if ((mc == NULL((void*)0)) ||(mcs != NULL((void*)0) && mcprev == NULL((void*)0)))
693 mc = mcnew;
694 if (mcprev != NULL((void*)0))
695 mcprev->next = mcnew;
696
697 s = sigma_add(symbol, lexsigma);
698 mchashval = (unsigned int) ((unsigned char) *(symbol)) * 256 + (unsigned int) ((unsigned char) *(symbol+1));
699 lexc_add_sigma_hash(symbol, s);
700 *(mchash+mchashval) = 1;
701 mcnew->sigma_number = s;
702 }
703}
704
705int lexc_find_mc(char *symbol) {
706 struct multichar_symbols *mcs;
707 for (mcs = mc ; mcs != NULL((void*)0) ; mcs = mcs->next) {
708 if (strcmp(symbol,mcs->symbol) == 0)
709 return 1;
710 }
711 return 0;
712}
713
714struct states *lexc_find_lex_state(char *name) {
715 struct lexstates *l;
716 for (l = lexstates ; l != NULL((void*)0); l = l->next) {
717 if (strcmp(name,l->name) == 0)
718 return (l->state);
719 }
720 return NULL((void*)0);
721}
722
723void lexc_add_word() {
724 /** Add a word from source state to destination state */
725 struct trans *newtrans, *trans;
726 struct states *sourcestate, *deststate, *newstate;
727 int i, follow, len;
728
729 if (current_entry == REGEX_ENTRY2) {
730 lexc_add_network();
731 return;
732 }
733
734 /* find source, dest */
735 sourcestate = clexicon->state;
736 deststate = ctarget->state;
737
738 for (i=0; *(cwordin+i) != -1; i++) {}
739 len = i;
740 maxlen = len > maxlen ? len : maxlen;
741
742 /* We follow the source state if the symbols are the same */
743 /* To merge prefixes */
744 for (follow = 1, i=0; *(cwordin+i) != -1; i++) {
745
746 if (follow == 1) {
747 for (trans = sourcestate->trans; trans != NULL((void*)0) ; trans = trans->next) {
748 if (trans->in == *(cwordin+i) && trans->out == *(cwordout+i) && trans->target->lexstate == NULL((void*)0)) {
749 /* Can't follow if target needs to be lexstate */
750 if (*(cwordin+i+1) == -1 && trans->target != deststate) {
751 continue;
752 }
753 sourcestate = trans->target;
754 sourcestate->mergeable = 0;
755 /* Breakout */
756 goto breakout;
757 }
758 }
759 }
760 follow = 0;
761
762 newtrans = malloc(sizeof(struct trans));
763 if (*(cwordin+i+1) == -1) {
764 newtrans->target = deststate;
765 } else {
766 newstate = malloc(sizeof(struct states));
767 lexc_add_state(newstate);
768 newtrans->target = newstate;
769 newstate->trans = NULL((void*)0);
770 newstate->lexstate = NULL((void*)0);
771 newstate->mergeable = 1;
772 newstate->hashval = lexc_suffix_hash(i+1);
773 newstate->distance = len - i - 1;
774 newstate->merge_with = newstate;
775 }
776 newtrans->next = sourcestate->trans;
777 sourcestate->trans = newtrans;
778
779 newtrans->in = *(cwordin+i);
780 newtrans->out = *(cwordout+i);
781
782 sourcestate = newtrans->target;
783 breakout:;
784
785 }
786 return;
787}
788
789void lexc_number_states() {
790 int n, smax, hasroot;
791 struct statelist *s;
792 struct lexstates *l;
793
794 smax = n = hasfinal = 0;
795
796 for (hasroot = 0, s = statelist; s != NULL((void*)0); s = s->next) {
797 smax++;
798 if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "Root") == 0) {
799 s->state->number = 0;
800 s->start = 1;
801 n++;
802 hasroot = 1;
803 break;
804 }
805 }
806 /* If there is no Root lexicon, the first lexicon mentioned is Root */
807 if (!hasroot) {
808 for (s = statelist; s != NULL((void*)0); s = s->next) {
809 if (s->next == NULL((void*)0)) {
810 s->state->number = 0;
811 if (g_verbose)
812 {
813 fprintf(stderrstderr,"*Warning: no Root lexicon, using '%s' as Root.\n",s->state->lexstate->name);
814 fflush(stderrstderr);
815 }
816 s->start = 1;
817 n++;
818 }
819 }
820 }
821 /* Mark # as the last state */
822 for (s = statelist; s != NULL((void*)0); s = s->next) {
823 if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "#") == 0) {
824 s->state->number = smax-1;
825 s->final = 1;
826 hasfinal = 1;
827 } else if (s->state->lexstate != NULL((void*)0) && strcmp(s->state->lexstate->name, "#") != 0 && s->state->lexstate->has_outgoing == 0) {
828 /* Also mark uncontinued states as final (this is warned about elsewhere) */
829 s->final = 1;
830 }
831 }
832
833 for (s = statelist; s != NULL((void*)0); s = s->next) {
834 if (s->state->number == -1) {
835 s->state->number = n;
836 n++;
837 }
838 }
839 lexc_statecount = n+1;
840 for (l = lexstates; l != NULL((void*)0) ; l = l->next) {
841 if (l->targeted == 0 && l->state->number != 0) {
842 if (g_verbose)
843 {
844 fprintf(stderrstderr,"*Warning: lexicon '%s' defined but not used\n",l->name);
845 fflush(stderrstderr);
846 }
847 }
848 if (l->has_outgoing == 0 && strcmp(l->name, "#") != 0) {
849 if (g_verbose)
850 {
851 fprintf(stderrstderr,"***Warning: lexicon '%s' used but never defined\n",l->name);
852 fflush(stderrstderr);
853 }
854 }
855 }
856}
857
858int lexc_eq_paths(struct states *one, struct states *two) {
859 while (one->lexstate == NULL((void*)0) && two->lexstate == NULL((void*)0)) {
860 if (one->trans->in != two->trans->in || one->trans->out != two->trans->out)
861 return 0;
862 one = one->trans->target;
863 two = two->trans->target;
864 }
865 if (one->lexstate != two->lexstate)
866 return 0;
867 return 1;
868}
869
870void lexc_merge_states() {
871 struct lenlist {
872 struct states *state;
873 struct lenlist *next;
874 };
875 struct hashstates {
876 struct states *state;
877 struct hashstates *next;
878 } *hashstates, *currenth, *newh;
879
880 struct lenlist *lenlist, *newl, *currentl;
881 struct statelist *s, *sprev, *sf;
882 struct states *state, *purgestate;
883 struct trans *t, *tprev;
884 int i, numstates, tablesize, hash;
885
886 /* Create array of ptrs to states depending on string length */
887 lenlist = calloc(maxlen+1,sizeof(struct lenlist));
888 numstates = 0;
889 for (s = statelist ; s!= NULL((void*)0); s = s->next) {
890 if (s->state->mergeable)
891 numstates++;
892 }
893
894 /* Find a suitable prime for hashing: proportional to the size of the */
895 /* number of mergeable states */
896
897 for (i = 0; primes[i] < numstates/4; i++) { }
898 tablesize = primes[i];
899 hashstates = calloc(tablesize,sizeof(struct hashstates));
900
901 for (s = statelist ; s!= NULL((void*)0); s = s->next) {
902 if (s->state->mergeable) {
903 numstates++;
904 currentl = lenlist+(s->state->distance);
905 if (currentl->state == NULL((void*)0))
906 currentl->state = s->state;
907 else {
908 newl = calloc(1,sizeof(struct lenlist));
909 newl->state = s->state;
910 newl->next = currentl->next;
911 currentl->next = newl;
912 }
913 s->state->hashval = s->state->hashval % tablesize;
914 currenth = hashstates+s->state->hashval;
915 if (currenth->state == NULL((void*)0)) {
916 currenth->state = s->state;
917 } else {
918 newh = calloc(1,sizeof(struct hashstates));
919 newh->state = s->state;
920 newh->next = currenth->next;
921 currenth->next = newh;
922 }
923 }
924 }
925
926 for (i = maxlen; i >= 1 ; i--) {
927 /* printf("Analyzing: [%i]...",i); fflush(stdout); */
928 for (currentl = (lenlist+i); currentl != NULL((void*)0); currentl = currentl->next) {
929 if (currentl->state == NULL((void*)0))
930 break;
931 if (currentl->state->mergeable != 1)
932 continue;
933 /* Find states hashing to same value as current */
934 state = currentl->state;
935 hash = state->hashval;
936 for (currenth = hashstates+hash; currenth != NULL((void*)0); currenth = currenth->next) {
937 /* Merge */
938 if (currenth->state != state && currenth->state->mergeable == 1 && currenth->state->distance == state->distance && lexc_eq_paths(currenth->state,state)) {
939 currenth->state->merge_with = state;
940 for (purgestate = currenth->state; purgestate->lexstate == NULL((void*)0); purgestate = purgestate->trans->target) {
941 purgestate->mergeable = 2;
942 }
943 }
944 }
945 }
946 }
947
948 /* Go through statelist and remove merged states and free states, trans */
949
950 for (s = statelist, sprev = NULL((void*)0); s != NULL((void*)0); s = s->next) {
951 for (t = s->state->trans, tprev = NULL((void*)0); t != NULL((void*)0); tprev = t, t = t->next) {
952 t->target = t->target->merge_with;
953 if (tprev != NULL((void*)0) && s->state->mergeable == 2) {
954 free(tprev);
955 } else {
956 if (t->target->lexstate != NULL((void*)0))
957 t->target->lexstate->targeted = 1;
958 }
959 }
960 if (tprev != NULL((void*)0) && s->state->mergeable == 2)
961 free(tprev);
962 }
963 for (s = statelist, sprev = NULL((void*)0); s != NULL((void*)0); ) {
964 if (s->state->mergeable == 2) {
965 if (sprev != NULL((void*)0)) {
966 sprev->next = s->next;
967 } else {
968 statelist = s;
969 }
970 free(s->state);
971 sf = s;
972 s = s->next;
973 free(sf);
974 } else {
975 sprev = s;
976 s = s ->next;
977 }
978 }
979
980 /* Cleanup */
981
982 for (i = 0; i < maxlen ; i++) {
983 newl = NULL((void*)0);
984 for (currentl = (lenlist+i)->next; currentl != NULL((void*)0) ;currentl=currentl->next) {
985 if (newl != NULL((void*)0))
986 free(newl);
987 newl = currentl;
988 }
989 if (newl != NULL((void*)0))
990 free(newl);
991 }
992 for (i = 0; i < tablesize ; i++) {
993 newh = NULL((void*)0);
994 for (currenth = (hashstates+i)->next; currenth != NULL((void*)0) ;currenth=currenth->next) {
995 if (newh != NULL((void*)0))
996 free(newh);
997 newh = currenth;
998 }
999 if (newh != NULL((void*)0))
1000 free(newh);
1001 }
1002 free(hashstates);
1003 free(lenlist);
1004}
1005
1006struct fsm *lexc_to_fsm() {
1007 struct statelist *s, *sa;
1008 struct fsm_state *fsm;
1009 struct fsm *net;
1010 struct trans *t;
1011 int i, j, linecount;
1012
1013 if (g_verbose)
1014 {
1015 fprintf(stderrstderr,"Building lexicon...\n");
1016 fflush(stderrstderr);
1017 }
1018 lexc_merge_states();
1019 net = fsm_create("");
1020 free(net->sigma);
1021 net->sigma = lexsigma;
1022 lexc_number_states();
1023 if (hasfinal == 0) {
1024 if (g_verbose)
1025 {
1026 fprintf(stderrstderr,"Warning: # is never reached!!!\n");
1027 fflush(stderrstderr);
1028 }
1029 return(fsm_empty_set());
1030 }
1031 sa = malloc(sizeof(struct statelist)*lexc_statecount);
1032 for (s = statelist; s != NULL((void*)0); s = s->next) {
1033 sa[s->state->number].state = s->state;
1034 sa[s->state->number].start = s->start;
1035 sa[s->state->number].final = s->final;
1036 }
1037 linecount = 0;
1038 for (s = statelist; s != NULL((void*)0); s = s->next) {
1039 linecount++;
1040 for (t = s->state->trans; t != NULL((void*)0); t = t->next)
1041 linecount++;
1042 }
1043 fsm = malloc(sizeof(struct fsm_state)*(linecount+1));
1044 for (i = 0, j = 0, s = sa; j < lexc_statecount; j++) {
1045 if (s[j].state->trans == NULL((void*)0)) {
1046 add_fsm_arc(fsm,i,s[j].state->number, -1, -1, -1, s[j].final, s[j].start);
1047 i++;
1048 } else {
1049 for (t = s[j].state->trans; t != NULL((void*)0); t = t->next) {
1050 add_fsm_arc(fsm,i,s[j].state->number,t->in,t->out,t->target->number,s[j].final,s[j].start);
1051 i++;
1052 }
1053 }
1054 }
1055 add_fsm_arc(fsm, i, -1, -1, -1, -1, -1, -1);
1056 net->states = fsm;
1057 net->statecount = lexc_statecount;
1058 fsm_update_flags(net, UNK2, UNK2, UNK2, UNK2, UNK2, UNK2);
1059 if (sigma_find_number(EPSILON0, lexsigma) == -1)
1060 sigma_add_special(EPSILON0, lexsigma);
1061 free(s);
1062 lexc_cleanup();
1063 sigma_cleanup(net,0);
1064 sigma_sort(net);
1065
1066 if (g_verbose)
1067 {
1068 fprintf(stderrstderr,"Determinizing...\n");
1069 fflush(stderrstderr);
1070 }
1071 net = fsm_determinize(net);
1072 if (g_verbose)
1073 {
1074 fprintf(stderrstderr,"Minimizing...\n");
1075 fflush(stderrstderr);
1076 }
1077 net = fsm_topsort(fsm_minimize(net));
1078 if (g_verbose)
1079 {
1080 fprintf(stderrstderr,"Done!\n");
1081 fflush(stderrstderr);
1082 }
1083 return(net);
1084}
1085
1086void lexc_cleanup() {
1087 struct lexstates *l, *ln;
1088 struct statelist *s, *sn;
1089 struct trans *t, *tn;
1090 struct multichar_symbols *mcs, *mcsn;
1091 struct lexc_hashtable *lhash, *lprev;
1092 int i;
1093 free(mchash);
1094 for (i=0; i < SIGMA_HASH_TABLESIZE3079; i++) {
1095 for (lhash = hashtable+i; lhash != NULL((void*)0); ) {
1096 if (lhash->symbol != NULL((void*)0)) {
1097 free(lhash->symbol);
1098 }
1099 lprev = lhash;
1100 lhash = lhash->next;
1101 if (lprev != hashtable+i) { free(lprev); }
1102 }
1103 }
1104 free(hashtable);
1105 for (mcs = mc ; mcs != NULL((void*)0) ; mcs = mcsn) {
1106 mcsn = mcs->next;
1107 free(mcs->symbol);
1108 free(mcs);
1109 }
1110 for (l = lexstates ; l != NULL((void*)0) ; l = ln) {
1111 ln = l->next;
1112 free(l->name);
1113 free(l);
1114 }
1115 for (s = statelist; s != NULL((void*)0); s = s->next) {
1116 for (t = s->state->trans; t != NULL((void*)0); t = tn) {
1117 tn = t->next;
1118 free(t);
1119 }
1120 free(s->state);
1121 }
1122 for (s = statelist; s != NULL((void*)0); s = sn) {
1123 sn = s->next;
1124 free(s);
1125 }
1126}