clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name lexcread.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fcoverage-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -resource-dir /usr/lib/llvm-16/lib/clang/16 -D _GNU_SOURCE -I /tmp/build/foma/foma-0.10.0+g279~a2d32b38 -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -Wno-missing-field-initializers -Wno-deprecated -Wno-unused-parameter -std=c18 -fdebug-compilation-dir=/tmp/build/foma/foma-0.10.0+g279~a2d32b38 -ferror-limit 19 -fvisibility=hidden -fgnuc-version=4.2.1 -analyzer-output=html -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/build/foma/scan-build/2024-09-11-155945-2678-1 -x c /tmp/build/foma/foma-0.10.0+g279~a2d32b38/lexcread.c
1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | #include <stdio.h> |
19 | #include <stdlib.h> |
20 | #include <string.h> |
21 | #include "foma.h" |
22 | #include "lexc.h" |
23 | |
24 | #define SIGMA_HASH_TABLESIZE 3079 |
25 | |
26 | #define WORD_ENTRY 1 |
27 | #define REGEX_ENTRY 2 |
28 | |
29 | extern int g_lexc_align; |
30 | extern int g_verbose; |
31 | |
32 | struct multichar_symbols { |
33 | char *symbol; |
34 | short int sigma_number; |
35 | struct multichar_symbols *next; |
36 | }; |
37 | |
38 | struct lexstates { |
39 | char *name; |
40 | struct states *state; |
41 | struct lexstates *next; |
42 | unsigned char targeted; |
43 | unsigned char has_outgoing; |
44 | }; |
45 | |
46 | struct states { |
47 | struct trans { |
48 | short int in; |
49 | short int out; |
50 | struct states *target; |
51 | struct trans *next; |
52 | } *trans; |
53 | struct lexstates *lexstate; |
54 | int number; |
55 | unsigned int hashval; |
56 | unsigned char mergeable; |
57 | |
58 | unsigned short int distance; |
59 | struct states *merge_with; |
60 | }; |
61 | |
62 | struct statelist { |
63 | struct states *state; |
64 | struct statelist *next; |
65 | char start; |
66 | char final; |
67 | }; |
68 | |
69 | struct lexc_hashtable { |
70 | char *symbol; |
71 | struct lexc_hashtable *next; |
72 | int sigma_number; |
73 | }; |
74 | |
75 | static unsigned int primes[26] = {61,127,251,509,1021,2039,4093,8191,16381,32749,65521,131071,262139,524287,1048573,2097143,4194301,8388593,16777213,33554393,67108859,134217689,268435399,536870909,1073741789,2147483647}; |
76 | |
77 | static struct statelist *statelist = NULL; |
78 | static struct multichar_symbols *mc = NULL; |
79 | static struct lexstates *lexstates = NULL; |
80 | static struct sigma *lexsigma = NULL; |
81 | static struct lexc_hashtable *hashtable; |
82 | static struct fsm *current_regex_network; |
83 | |
84 | static int cwordin[1000], cwordout[1000], medcwordin[2000], medcwordout[2000], carity, lexc_statecount, maxlen, hasfinal, current_entry, net_has_unknown; |
85 | static _Bool *mchash; |
86 | static struct lexstates *clexicon, *ctarget; |
87 | |
88 | static char *mystrncpy(char *dest, char *src, int len); |
89 | static void lexc_string_to_tokens(char *string, int *intarr); |
90 | static void lexc_pad(); |
91 | static void lexc_medpad(); |
92 | static void lexc_number_states(); |
93 | static void lexc_cleanup(); |
94 | static unsigned int lexc_suffix_hash(int offset); |
95 | static unsigned int lexc_symbol_hash(char *s); |
96 | static void lexc_update_unknowns(int sigma_number); |
97 | |
98 | static unsigned int lexc_suffix_hash(int offset) { |
99 | register unsigned int h = 0, g, p; |
100 | |
101 | for(p = offset; cwordin[p] != -1; p++) { |
102 | h = (h << 4) + (unsigned int) (cwordin[p] | (cwordout[p] << 8)); |
103 | if (0 != (g = h & 0xf0000000)) { |
104 | h = h ^ (g >> 24); |
105 | h = h ^ g; |
106 | } |
107 | } |
108 | |
109 | return h; |
110 | } |
111 | |
112 | static unsigned int lexc_symbol_hash(char *s) { |
113 | register unsigned int hash; |
114 | int c; |
115 | hash = 5381; |
116 | while ((c = *s++)) |
117 | hash = ((hash << 5) + hash) + c; |
118 | return (hash % SIGMA_HASH_TABLESIZE); |
119 | } |
120 | |
121 | int lexc_find_sigma_hash(char *symbol) { |
122 | int ptr; |
123 | struct lexc_hashtable *h; |
124 | ptr = lexc_symbol_hash(symbol); |
125 | |
126 | if ((hashtable+ptr)->symbol == NULL) |
127 | return -1; |
128 | for (h = (hashtable+ptr); h != NULL; h = h->next) { |
129 | if (strcmp(symbol,h->symbol) == 0) { |
130 | return (h->sigma_number); |
131 | } |
132 | } |
133 | return -1; |
134 | } |
135 | |
136 | void lexc_add_sigma_hash(char *symbol, int number) { |
137 | int ptr; |
138 | struct lexc_hashtable *h, *hnew; |
139 | ptr = lexc_symbol_hash(symbol); |
140 | |
141 | if (net_has_unknown == 1) |
142 | lexc_update_unknowns(number); |
143 | |
144 | if ((hashtable+ptr)->symbol == NULL) { |
145 | (hashtable+ptr)->symbol = strdup(symbol); |
146 | (hashtable+ptr)->sigma_number = number; |
147 | return; |
148 | } |
149 | for (h = hashtable+ptr; h->next != NULL; h = h->next) { |
150 | } |
151 | hnew = malloc(sizeof(struct lexc_hashtable)); |
152 | hnew->symbol = strdup(symbol); |
153 | hnew->sigma_number = number; |
154 | h->next = hnew; |
155 | hnew->next = NULL; |
156 | } |
157 | |
158 | void lexc_init() { |
159 | int i; |
160 | lexsigma = sigma_create(); |
161 | mc = NULL; |
162 | lexstates = NULL; |
163 | clexicon = NULL; |
164 | ctarget = NULL; |
165 | statelist = NULL; |
166 | lexc_statecount = 0; |
167 | net_has_unknown = 0; |
168 | lexc_clear_current_word(); |
169 | hashtable = calloc(SIGMA_HASH_TABLESIZE, sizeof(struct lexc_hashtable)); |
170 | |
171 | maxlen = 0; |
172 | |
173 | mchash = calloc(256*256, sizeof(_Bool)); |
174 | for (i=0; i< SIGMA_HASH_TABLESIZE; i++) { |
175 | (hashtable+i)->symbol = NULL; |
176 | (hashtable+i)->sigma_number = -1; |
177 | (hashtable+i)->next = NULL; |
178 | } |
179 | } |
180 | |
181 | void lexc_clear_current_word() { |
182 | cwordin[0] = cwordout[0] = 0; |
183 | cwordin[1] = cwordout[1] = -1; |
184 | current_entry = WORD_ENTRY; |
185 | } |
186 | |
187 | void lexc_add_state(struct states *s) { |
188 | struct statelist *sl; |
189 | sl = malloc(sizeof(struct statelist)); |
190 | sl->state = s; |
191 | s->number = -1; |
192 | sl->next = statelist; |
193 | sl->start = 0; |
194 | sl->final = 0; |
195 | statelist = sl; |
196 | lexc_statecount++; |
197 | } |
198 | |
199 | |
200 | |
201 | |
202 | |
203 | |
204 | |
205 | |
206 | void lexc_update_unknowns(int sigma_number) { |
207 | struct statelist *s; |
208 | struct trans *t, *newtrans; |
209 | for (s = statelist; s != NULL; s = s->next) { |
210 | if (s->state->mergeable == 2) |
211 | continue; |
212 | for (t=s->state->trans ; t!=NULL; t= t->next) { |
213 | if (t->in == IDENTITY || t->out == IDENTITY) { |
214 | newtrans = malloc(sizeof(struct trans)); |
215 | newtrans->in = sigma_number; |
216 | newtrans->out = sigma_number; |
217 | newtrans->target = t->target; |
218 | newtrans->next = t->next; |
219 | t->next = newtrans; |
220 | } |
221 | } |
222 | } |
223 | } |
224 | |
225 | void lexc_add_network() { |
226 | |
227 | struct fsm *net; |
228 | struct fsm_state *fsm; |
229 | struct sigma *sigma; |
230 | struct states **slist, *sourcestate, *deststate, *newstate; |
231 | struct statelist *s; |
232 | struct trans *newtrans; |
233 | int i, j, *sigreplace, signumber, maxstate, *finals, unknown_symbols, first_new_sigma, *unk = NULL; |
234 | |
235 | unknown_symbols = 0; |
236 | first_new_sigma = 0; |
237 | sourcestate = clexicon->state; |
238 | deststate = ctarget->state; |
239 | |
240 | net = current_regex_network; |
241 | fsm = net->states; |
242 | |
243 | sigreplace = calloc(sigma_max(net->sigma)+1,sizeof(int)); |
| |
244 | |
245 | for (sigma = net->sigma; sigma != NULL && sigma->number != -1; sigma = sigma->next) { |
| 5 | | Assuming 'sigma' is equal to NULL | |
|
246 | if ((signumber = lexc_find_sigma_hash(sigma->symbol)) == -1) { |
247 | |
248 | signumber = sigma_add(sigma->symbol, lexsigma); |
249 | first_new_sigma = first_new_sigma > 0 ? first_new_sigma : signumber; |
250 | lexc_add_sigma_hash(sigma->symbol, signumber); |
251 | *(sigreplace+sigma->number) = signumber; |
252 | } else { |
253 | |
254 | *(sigreplace+sigma->number) = signumber; |
255 | } |
256 | } |
257 | |
258 | |
259 | for (i=0, maxstate = 0; (fsm+i)->state_no != -1; i++) { |
| 6 | | Assuming the condition is false | |
|
| 7 | | Loop condition is false. Execution continues on line 268 | |
|
260 | if ((fsm+i)->in != -1) |
261 | (fsm+i)->in = *(sigreplace+(fsm+i)->in); |
262 | if ((fsm+i)->out != -1) |
263 | (fsm+i)->out = *(sigreplace+(fsm+i)->out); |
264 | maxstate = (fsm+i)->state_no > maxstate ? (fsm+i)->state_no : maxstate; |
265 | if ((fsm+i)->in == IDENTITY || (fsm+i)->in == UNKNOWN || (fsm+i)->out == UNKNOWN) |
266 | unknown_symbols = 1; |
267 | } |
268 | if (unknown_symbols == 1) { |
| 8 | | Potential leak of memory pointed to by 'sigreplace' |
|
269 | unk = calloc(sigma_max(lexsigma)+2,sizeof(int)); |
270 | for (i=0, sigma = lexsigma; sigma != NULL && sigma->number != -1; sigma=sigma->next) { |
271 | if (sigma->number > 2 && sigma_find(sigma->symbol, net->sigma) == -1) { |
272 | *(unk+i) = sigma->number; |
273 | i++; |
274 | } |
275 | } |
276 | } |
277 | |
278 | slist = calloc(sizeof(**slist),maxstate+1); |
279 | finals = calloc(sizeof(int),maxstate+1); |
280 | |
281 | for (i=0; i <= maxstate;i++) { |
282 | newstate = malloc(sizeof(struct states)); |
283 | *(slist+i) = newstate; |
284 | newstate->trans = NULL; |
285 | newstate->lexstate = NULL; |
286 | newstate->number = -1; |
287 | newstate->hashval = -1; |
288 | newstate->mergeable = 0; |
289 | newstate->distance = 0; |
290 | newstate->merge_with = newstate; |
291 | s = malloc(sizeof(struct statelist)); |
292 | s->state = newstate; |
293 | s->next = statelist; |
294 | s->start = 0; |
295 | s->final = 0; |
296 | statelist = s; |
297 | } |
298 | |
299 | newtrans = malloc(sizeof(struct trans)); |
300 | newtrans->in = EPSILON; |
301 | newtrans->out = EPSILON; |
302 | newtrans->target = *slist; |
303 | newtrans->next = sourcestate->trans; |
304 | sourcestate->trans = newtrans; |
305 | |
306 | for (i=0; (fsm+i)->state_no != -1; i++) { |
307 | if ((fsm+i)->target != -1) { |
308 | newstate = *(slist+(fsm+i)->state_no); |
309 | newtrans = malloc(sizeof(struct trans)); |
310 | newtrans->in = (fsm+i)->in; |
311 | newtrans->out = (fsm+i)->out; |
312 | newtrans->target = *(slist+(fsm+i)->target); |
313 | newtrans->next = newstate->trans; |
314 | newstate->trans = newtrans; |
315 | |
316 | |
317 | if (unknown_symbols == 1) { |
318 | if ((fsm+i)->in == IDENTITY || (fsm+i)->out == IDENTITY) { |
319 | for (j=0; *(unk+j) != 0; j++) { |
320 | newtrans = malloc(sizeof(struct trans)); |
321 | newtrans->in = *(unk+j); |
322 | newtrans->out = *(unk+j); |
323 | newtrans->target = *(slist+(fsm+i)->target); |
324 | newtrans->next = newstate->trans; |
325 | newstate->trans = newtrans; |
326 | } |
327 | } |
328 | } |
329 | } |
330 | finals[(fsm+i)->state_no] = (fsm+i)->final_state; |
331 | } |
332 | |
333 | for (i=0; i <= maxstate; i++) { |
334 | if (finals[i] == 1) { |
335 | newtrans = malloc(sizeof(struct trans)); |
336 | newtrans->in = newtrans->out = EPSILON; |
337 | newtrans->target = deststate; |
338 | newstate = *(slist+i); |
339 | newtrans->next = newstate->trans; |
340 | newstate->trans = newtrans; |
341 | } |
342 | } |
343 | if (unknown_symbols == 1) { |
344 | free(unk); |
345 | net_has_unknown = 1; |
346 | } |
347 | free(slist); |
348 | free(finals); |
349 | } |
350 | |
351 | void lexc_set_network(struct fsm *net) { |
352 | current_regex_network = net; |
353 | current_entry = REGEX_ENTRY; |
354 | return; |
355 | } |
356 | |
357 | void lexc_set_current_lexicon(char *name, int which) { |
358 | |
359 | |
360 | |
361 | struct lexstates *l; |
362 | struct states *newstate; |
363 | |
364 | for (l = lexstates; l != NULL; l = l->next) { |
365 | if (strcmp(name,l->name) == 0) { |
366 | if (which == 0) { |
367 | l->has_outgoing = 1; |
368 | clexicon = l; |
369 | } else { |
370 | ctarget = l; |
371 | } |
372 | return; |
373 | } |
374 | } |
375 | l = malloc(sizeof(struct lexstates)); |
376 | l->next = lexstates; |
377 | l->name = strdup(name); |
378 | l->has_outgoing = 0; |
379 | l->targeted = 0; |
380 | lexstates = l; |
381 | newstate = malloc(sizeof(struct states)); |
382 | lexc_add_state(newstate); |
383 | newstate->lexstate = l; |
384 | newstate->trans = NULL; |
385 | newstate->mergeable = 0; |
386 | newstate->merge_with = newstate; |
387 | l->state = newstate; |
388 | if (which == 0) { |
389 | clexicon = l; |
390 | l->has_outgoing = 1; |
391 | } else { |
392 | ctarget = l; |
393 | } |
394 | } |
395 | |
396 | char *lexc_find_delim(char *name, char delimiter, char escape) { |
397 | int i; |
398 | for (i=0; *(name+i) != '\0'; i++) { |
399 | if (*(name+i) == escape && *(name+i+1) != '\0') { |
400 | i++; |
401 | continue; |
402 | } |
403 | if (*(name+i) == delimiter) { |
404 | return name+i; |
405 | } |
406 | } |
407 | return NULL; |
408 | } |
409 | |
410 | void lexc_deescape_string(char *name, char escape, int mode) { |
411 | int i, j; |
412 | for (i=0, j=0; *(name+i) != '\0'; i++) { |
413 | *(name+j) = *(name+i); |
414 | if (*(name+i) == escape) { |
415 | *(name+j) = *(name+i+1); |
416 | j++; |
417 | i++; |
418 | continue; |
419 | } |
420 | else if (mode == 1 && *(name+i) == '0') { |
421 | |
422 | *(name+j) = (unsigned char) 0xff; |
423 | j++; |
424 | continue; |
425 | } |
426 | else if (*(name+i) != escape && *(name+i) != '0') { |
427 | j++; |
428 | continue; |
429 | } |
430 | } |
431 | *(name+j) = '\0'; |
432 | } |
433 | |
434 | |
435 | |
436 | |
437 | void lexc_set_current_word(char *name) { |
438 | char *instring, *outstring; |
439 | int i; |
440 | |
441 | carity = 1; |
442 | instring = name; |
443 | outstring = lexc_find_delim(name,':','%'); |
444 | |
445 | if (outstring != NULL) { |
446 | *outstring = '\0'; |
447 | outstring = outstring+1; |
448 | lexc_deescape_string(outstring,'%',1); |
449 | carity = 2; |
450 | } |
451 | lexc_deescape_string(instring, '%',1); |
452 | |
453 | |
454 | lexc_string_to_tokens(instring, cwordin); |
455 | |
456 | if (carity == 2) { |
457 | lexc_string_to_tokens(outstring, cwordout); |
458 | if (g_lexc_align) |
459 | lexc_medpad(); |
460 | else |
461 | lexc_pad(); |
462 | } else { |
463 | for (i=0; *(cwordin+i) != -1; i++) { |
464 | *(cwordout+i) = *(cwordin+i); |
465 | } |
466 | *(cwordout+i) = -1; |
467 | |
468 | } |
469 | current_entry = WORD_ENTRY; |
470 | } |
471 | |
472 | |
473 | #define LEV_DOWN 0 |
474 | #define LEV_LEFT 1 |
475 | #define LEV_DIAG 2 |
476 | |
477 | void lexc_medpad() { |
478 | int i, j, x, y, s1len, s2len, left, down, diag, dir; |
479 | |
480 | if (*cwordin == -1 && *cwordout == -1) { |
481 | *cwordin = *cwordout = EPSILON; |
482 | *(cwordin+1) = *(cwordout+1) = -1; |
483 | return; |
484 | } |
485 | |
486 | for (i = 0, j = 0; cwordin[i] != -1; i++) { |
487 | if (cwordin[i] == EPSILON) { |
488 | continue; |
489 | } |
490 | cwordin[j] = cwordin[i]; |
491 | j++; |
492 | } |
493 | cwordin[j] = -1; |
494 | |
495 | for (i = 0, j = 0; cwordout[i] != -1; i++) { |
496 | if (cwordout[i] == EPSILON) { |
497 | continue; |
498 | } |
499 | cwordout[j] = cwordout[i]; |
500 | j++; |
501 | } |
502 | cwordout[j] = -1; |
503 | |
504 | for (i = 0; cwordin[i] != -1; i++) { } |
505 | s1len = i; |
506 | for (i = 0; cwordout[i] != -1; i++) { } |
507 | s2len = i; |
508 | |
509 | int **matrix = calloc(s1len + 2, sizeof(int*)); |
510 | int** dirmatrix = calloc(s1len + 2, sizeof(int*)); |
511 | for (size_t i = 0; i < s1len + 2; ++i) { |
512 | matrix[i] = calloc(s2len + 2, sizeof(int)); |
513 | dirmatrix[i] = calloc(s2len + 2, sizeof(int)); |
514 | } |
515 | |
516 | matrix[0][0] = 0; |
517 | dirmatrix[0][0] = 0; |
518 | for (x = 1; x <= s1len; x++) { |
519 | matrix[x][0] = matrix[x-1][0] + 1; |
520 | dirmatrix[x][0] = LEV_LEFT; |
521 | } |
522 | for (y = 1; y <= s2len; y++) { |
523 | matrix[0][y] = matrix[0][y-1] + 1; |
524 | dirmatrix[0][y] = LEV_DOWN; |
525 | } |
526 | for (x = 1; x <= s1len; x++) { |
527 | for (y = 1; y <= s2len; y++) { |
528 | diag = matrix[x-1][y-1] + (cwordin[x-1] == cwordout[y-1] ? 0 : 100); |
529 | down = matrix[x][y-1] + 1; |
530 | left = matrix[x-1][y] + 1; |
531 | if (diag <= left && diag <= down) { |
532 | matrix[x][y] = diag; |
533 | dirmatrix[x][y] = LEV_DIAG; |
534 | } else if (left <= diag && left <= down) { |
535 | matrix[x][y] = left; |
536 | dirmatrix[x][y] = LEV_LEFT; |
537 | } else { |
538 | matrix[x][y] = down ; |
539 | dirmatrix[x][y] = LEV_DOWN; |
540 | } |
541 | } |
542 | } |
543 | |
544 | for (x = s1len, y = s2len, i = 0; (x > 0) || (y > 0); i++) { |
545 | dir = dirmatrix[x][y]; |
546 | if (dir == LEV_DIAG) { |
547 | medcwordin[i] = cwordin[x-1]; |
548 | medcwordout[i] = cwordout[y-1]; |
549 | x--; |
550 | y--; |
551 | } |
552 | else if (dir == LEV_DOWN) { |
553 | medcwordin[i] = EPSILON; |
554 | medcwordout[i] = cwordout[y-1]; |
555 | y--; |
556 | } |
557 | else { |
558 | medcwordin[i] = cwordin[x-1]; |
559 | medcwordout[i] = EPSILON; |
560 | x--; |
561 | } |
562 | } |
563 | for (j = 0, i-= 1; i >= 0; j++, i--) { |
564 | cwordin[j] = medcwordin[i]; |
565 | cwordout[j] = medcwordout[i]; |
566 | } |
567 | cwordin[j] = -1; |
568 | cwordout[j] = -1; |
569 | |
570 | for (size_t i = 0; i < s1len + 2; ++i) { |
571 | free(matrix[i]); |
572 | free(dirmatrix[i]); |
573 | } |
574 | free(matrix); |
575 | free(dirmatrix); |
576 | } |
577 | |
578 | void lexc_pad() { |
579 | int i, pad; |
580 | |
581 | |
582 | if (*cwordin == -1 && *cwordout == -1) { |
583 | *cwordin = *cwordout = EPSILON; |
584 | *(cwordin+1) = *(cwordout+1) = -1; |
585 | return; |
586 | } |
587 | |
588 | for (i=0, pad = 0; ;i++) { |
589 | if (pad == 1 && *(cwordout+i) == -1) { |
590 | *(cwordin+i) = -1; |
591 | break; |
592 | } |
593 | if (pad == 2 && *(cwordin+i) == -1) { |
594 | *(cwordout+i) = -1; |
595 | break; |
596 | } |
597 | if (*(cwordin+i) == -1 && *(cwordout+i) != -1) { |
598 | pad = 1; |
599 | } |
600 | else if (*(cwordin+i) != -1 && *(cwordout+i) == -1) { |
601 | pad = 2; |
602 | } |
603 | if (pad == 1) { |
604 | *(cwordin+i) = EPSILON; |
605 | } |
606 | if (pad == 2) { |
607 | *(cwordout+i) = EPSILON; |
608 | } |
609 | if (pad == 0 && *(cwordin+i) == -1) |
610 | break; |
611 | } |
612 | } |
613 | |
614 | void lexc_string_to_tokens(char *string, int *intarr) { |
615 | int len, i, pos, skip, signumber, multi; |
616 | unsigned int mchashval; |
617 | char tmpstring[5]; |
618 | struct multichar_symbols *mcs; |
619 | len = strlen(string); |
620 | for (i=0, pos = 0; i < len; ) { |
621 | |
622 | |
623 | if ((unsigned char) string[i] == 0xff) { |
624 | *(intarr+pos) = EPSILON; |
625 | pos++; |
626 | i++; |
627 | continue; |
628 | } |
629 | |
630 | multi = 0; |
631 | mchashval = (unsigned int) ((unsigned char) *(string+i)) * 256 + (unsigned int) ((unsigned char) *(string+i+1)); |
632 | if ((i < len-1) && *(mchash+mchashval) == 1) { |
633 | for (mcs = mc; mcs != NULL; mcs = mcs->next) { |
634 | if (strncmp(string+i,mcs->symbol,strlen(mcs->symbol)) == 0) { |
635 | |
636 | multi = 1; |
637 | break; |
638 | } |
639 | } |
640 | } |
641 | |
642 | if (multi) { |
643 | *(intarr+pos) = mcs->sigma_number; |
644 | pos++; |
645 | i += strlen(mcs->symbol); |
646 | } else { |
647 | skip = utf8skip(string+i); |
648 | if ((signumber = lexc_find_sigma_hash(mystrncpy(tmpstring,string+i,skip+1))) != -1) { |
649 | *(intarr+pos) = signumber; |
650 | pos++; |
651 | i = i + skip + 1; |
652 | } else { |
653 | signumber = sigma_add(mystrncpy(tmpstring, string+i, skip+1), lexsigma); |
654 | lexc_add_sigma_hash(tmpstring, signumber); |
655 | *(intarr+pos) = signumber; |
656 | pos++; |
657 | i = i + skip + 1; |
658 | } |
659 | } |
660 | } |
661 | *(intarr+pos) = -1; |
662 | } |
663 | |
664 | char *mystrncpy(char *dest, char *src, int len) { |
665 | int i; |
666 | for (i=0; i < len; i++) { |
667 | *(dest+i) = *(src+i); |
668 | if (*(src+i) == '\0') |
669 | return(dest); |
670 | } |
671 | *(dest+i) = '\0'; |
672 | |
673 | return(dest); |
674 | } |
675 | |
676 | |
677 | |
678 | |
679 | void lexc_add_mc(char *symbol) { |
680 | int s, len; |
681 | unsigned int mchashval; |
682 | struct multichar_symbols *mcs, *mcprev, *mcnew; |
683 | lexc_deescape_string(symbol,'%',0); |
684 | if (!lexc_find_mc(symbol)) { |
685 | len = utf8strlen(symbol); |
686 | mcprev = NULL; |
687 | for (mcs = mc; mcs != NULL && utf8strlen(mcs->symbol) > len; mcprev = mcs, mcs=mcs->next) { |
688 | } |
689 | mcnew = malloc(sizeof(struct multichar_symbols)); |
690 | mcnew->symbol = strdup(symbol); |
691 | mcnew->next = mcs; |
692 | if ((mc == NULL) ||(mcs != NULL && mcprev == NULL)) |
693 | mc = mcnew; |
694 | if (mcprev != NULL) |
695 | mcprev->next = mcnew; |
696 | |
697 | s = sigma_add(symbol, lexsigma); |
698 | mchashval = (unsigned int) ((unsigned char) *(symbol)) * 256 + (unsigned int) ((unsigned char) *(symbol+1)); |
699 | lexc_add_sigma_hash(symbol, s); |
700 | *(mchash+mchashval) = 1; |
701 | mcnew->sigma_number = s; |
702 | } |
703 | } |
704 | |
705 | int lexc_find_mc(char *symbol) { |
706 | struct multichar_symbols *mcs; |
707 | for (mcs = mc ; mcs != NULL ; mcs = mcs->next) { |
708 | if (strcmp(symbol,mcs->symbol) == 0) |
709 | return 1; |
710 | } |
711 | return 0; |
712 | } |
713 | |
714 | struct states *lexc_find_lex_state(char *name) { |
715 | struct lexstates *l; |
716 | for (l = lexstates ; l != NULL; l = l->next) { |
717 | if (strcmp(name,l->name) == 0) |
718 | return (l->state); |
719 | } |
720 | return NULL; |
721 | } |
722 | |
723 | void lexc_add_word() { |
724 | |
725 | struct trans *newtrans, *trans; |
726 | struct states *sourcestate, *deststate, *newstate; |
727 | int i, follow, len; |
728 | |
729 | if (current_entry == REGEX_ENTRY) { |
| 1 | Assuming 'current_entry' is equal to REGEX_ENTRY | |
|
| |
730 | lexc_add_network(); |
| 3 | | Calling 'lexc_add_network' | |
|
731 | return; |
732 | } |
733 | |
734 | |
735 | sourcestate = clexicon->state; |
736 | deststate = ctarget->state; |
737 | |
738 | for (i=0; *(cwordin+i) != -1; i++) {} |
739 | len = i; |
740 | maxlen = len > maxlen ? len : maxlen; |
741 | |
742 | |
743 | |
744 | for (follow = 1, i=0; *(cwordin+i) != -1; i++) { |
745 | |
746 | if (follow == 1) { |
747 | for (trans = sourcestate->trans; trans != NULL ; trans = trans->next) { |
748 | if (trans->in == *(cwordin+i) && trans->out == *(cwordout+i) && trans->target->lexstate == NULL) { |
749 | |
750 | if (*(cwordin+i+1) == -1 && trans->target != deststate) { |
751 | continue; |
752 | } |
753 | sourcestate = trans->target; |
754 | sourcestate->mergeable = 0; |
755 | |
756 | goto breakout; |
757 | } |
758 | } |
759 | } |
760 | follow = 0; |
761 | |
762 | newtrans = malloc(sizeof(struct trans)); |
763 | if (*(cwordin+i+1) == -1) { |
764 | newtrans->target = deststate; |
765 | } else { |
766 | newstate = malloc(sizeof(struct states)); |
767 | lexc_add_state(newstate); |
768 | newtrans->target = newstate; |
769 | newstate->trans = NULL; |
770 | newstate->lexstate = NULL; |
771 | newstate->mergeable = 1; |
772 | newstate->hashval = lexc_suffix_hash(i+1); |
773 | newstate->distance = len - i - 1; |
774 | newstate->merge_with = newstate; |
775 | } |
776 | newtrans->next = sourcestate->trans; |
777 | sourcestate->trans = newtrans; |
778 | |
779 | newtrans->in = *(cwordin+i); |
780 | newtrans->out = *(cwordout+i); |
781 | |
782 | sourcestate = newtrans->target; |
783 | breakout:; |
784 | |
785 | } |
786 | return; |
787 | } |
788 | |
789 | void lexc_number_states() { |
790 | int n, smax, hasroot; |
791 | struct statelist *s; |
792 | struct lexstates *l; |
793 | |
794 | smax = n = hasfinal = 0; |
795 | |
796 | for (hasroot = 0, s = statelist; s != NULL; s = s->next) { |
797 | smax++; |
798 | if (s->state->lexstate != NULL && strcmp(s->state->lexstate->name, "Root") == 0) { |
799 | s->state->number = 0; |
800 | s->start = 1; |
801 | n++; |
802 | hasroot = 1; |
803 | break; |
804 | } |
805 | } |
806 | |
807 | if (!hasroot) { |
808 | for (s = statelist; s != NULL; s = s->next) { |
809 | if (s->next == NULL) { |
810 | s->state->number = 0; |
811 | if (g_verbose) |
812 | { |
813 | fprintf(stderr,"*Warning: no Root lexicon, using '%s' as Root.\n",s->state->lexstate->name); |
814 | fflush(stderr); |
815 | } |
816 | s->start = 1; |
817 | n++; |
818 | } |
819 | } |
820 | } |
821 | |
822 | for (s = statelist; s != NULL; s = s->next) { |
823 | if (s->state->lexstate != NULL && strcmp(s->state->lexstate->name, "#") == 0) { |
824 | s->state->number = smax-1; |
825 | s->final = 1; |
826 | hasfinal = 1; |
827 | } else if (s->state->lexstate != NULL && strcmp(s->state->lexstate->name, "#") != 0 && s->state->lexstate->has_outgoing == 0) { |
828 | |
829 | s->final = 1; |
830 | } |
831 | } |
832 | |
833 | for (s = statelist; s != NULL; s = s->next) { |
834 | if (s->state->number == -1) { |
835 | s->state->number = n; |
836 | n++; |
837 | } |
838 | } |
839 | lexc_statecount = n+1; |
840 | for (l = lexstates; l != NULL ; l = l->next) { |
841 | if (l->targeted == 0 && l->state->number != 0) { |
842 | if (g_verbose) |
843 | { |
844 | fprintf(stderr,"*Warning: lexicon '%s' defined but not used\n",l->name); |
845 | fflush(stderr); |
846 | } |
847 | } |
848 | if (l->has_outgoing == 0 && strcmp(l->name, "#") != 0) { |
849 | if (g_verbose) |
850 | { |
851 | fprintf(stderr,"***Warning: lexicon '%s' used but never defined\n",l->name); |
852 | fflush(stderr); |
853 | } |
854 | } |
855 | } |
856 | } |
857 | |
858 | int lexc_eq_paths(struct states *one, struct states *two) { |
859 | while (one->lexstate == NULL && two->lexstate == NULL) { |
860 | if (one->trans->in != two->trans->in || one->trans->out != two->trans->out) |
861 | return 0; |
862 | one = one->trans->target; |
863 | two = two->trans->target; |
864 | } |
865 | if (one->lexstate != two->lexstate) |
866 | return 0; |
867 | return 1; |
868 | } |
869 | |
870 | void lexc_merge_states() { |
871 | struct lenlist { |
872 | struct states *state; |
873 | struct lenlist *next; |
874 | }; |
875 | struct hashstates { |
876 | struct states *state; |
877 | struct hashstates *next; |
878 | } *hashstates, *currenth, *newh; |
879 | |
880 | struct lenlist *lenlist, *newl, *currentl; |
881 | struct statelist *s, *sprev, *sf; |
882 | struct states *state, *purgestate; |
883 | struct trans *t, *tprev; |
884 | int i, numstates, tablesize, hash; |
885 | |
886 | |
887 | lenlist = calloc(maxlen+1,sizeof(struct lenlist)); |
888 | numstates = 0; |
889 | for (s = statelist ; s!= NULL; s = s->next) { |
890 | if (s->state->mergeable) |
891 | numstates++; |
892 | } |
893 | |
894 | |
895 | |
896 | |
897 | for (i = 0; primes[i] < numstates/4; i++) { } |
898 | tablesize = primes[i]; |
899 | hashstates = calloc(tablesize,sizeof(struct hashstates)); |
900 | |
901 | for (s = statelist ; s!= NULL; s = s->next) { |
902 | if (s->state->mergeable) { |
903 | numstates++; |
904 | currentl = lenlist+(s->state->distance); |
905 | if (currentl->state == NULL) |
906 | currentl->state = s->state; |
907 | else { |
908 | newl = calloc(1,sizeof(struct lenlist)); |
909 | newl->state = s->state; |
910 | newl->next = currentl->next; |
911 | currentl->next = newl; |
912 | } |
913 | s->state->hashval = s->state->hashval % tablesize; |
914 | currenth = hashstates+s->state->hashval; |
915 | if (currenth->state == NULL) { |
916 | currenth->state = s->state; |
917 | } else { |
918 | newh = calloc(1,sizeof(struct hashstates)); |
919 | newh->state = s->state; |
920 | newh->next = currenth->next; |
921 | currenth->next = newh; |
922 | } |
923 | } |
924 | } |
925 | |
926 | for (i = maxlen; i >= 1 ; i--) { |
927 | |
928 | for (currentl = (lenlist+i); currentl != NULL; currentl = currentl->next) { |
929 | if (currentl->state == NULL) |
930 | break; |
931 | if (currentl->state->mergeable != 1) |
932 | continue; |
933 | |
934 | state = currentl->state; |
935 | hash = state->hashval; |
936 | for (currenth = hashstates+hash; currenth != NULL; currenth = currenth->next) { |
937 | |
938 | if (currenth->state != state && currenth->state->mergeable == 1 && currenth->state->distance == state->distance && lexc_eq_paths(currenth->state,state)) { |
939 | currenth->state->merge_with = state; |
940 | for (purgestate = currenth->state; purgestate->lexstate == NULL; purgestate = purgestate->trans->target) { |
941 | purgestate->mergeable = 2; |
942 | } |
943 | } |
944 | } |
945 | } |
946 | } |
947 | |
948 | |
949 | |
950 | for (s = statelist, sprev = NULL; s != NULL; s = s->next) { |
951 | for (t = s->state->trans, tprev = NULL; t != NULL; tprev = t, t = t->next) { |
952 | t->target = t->target->merge_with; |
953 | if (tprev != NULL && s->state->mergeable == 2) { |
954 | free(tprev); |
955 | } else { |
956 | if (t->target->lexstate != NULL) |
957 | t->target->lexstate->targeted = 1; |
958 | } |
959 | } |
960 | if (tprev != NULL && s->state->mergeable == 2) |
961 | free(tprev); |
962 | } |
963 | for (s = statelist, sprev = NULL; s != NULL; ) { |
964 | if (s->state->mergeable == 2) { |
965 | if (sprev != NULL) { |
966 | sprev->next = s->next; |
967 | } else { |
968 | statelist = s; |
969 | } |
970 | free(s->state); |
971 | sf = s; |
972 | s = s->next; |
973 | free(sf); |
974 | } else { |
975 | sprev = s; |
976 | s = s ->next; |
977 | } |
978 | } |
979 | |
980 | |
981 | |
982 | for (i = 0; i < maxlen ; i++) { |
983 | newl = NULL; |
984 | for (currentl = (lenlist+i)->next; currentl != NULL ;currentl=currentl->next) { |
985 | if (newl != NULL) |
986 | free(newl); |
987 | newl = currentl; |
988 | } |
989 | if (newl != NULL) |
990 | free(newl); |
991 | } |
992 | for (i = 0; i < tablesize ; i++) { |
993 | newh = NULL; |
994 | for (currenth = (hashstates+i)->next; currenth != NULL ;currenth=currenth->next) { |
995 | if (newh != NULL) |
996 | free(newh); |
997 | newh = currenth; |
998 | } |
999 | if (newh != NULL) |
1000 | free(newh); |
1001 | } |
1002 | free(hashstates); |
1003 | free(lenlist); |
1004 | } |
1005 | |
1006 | struct fsm *lexc_to_fsm() { |
1007 | struct statelist *s, *sa; |
1008 | struct fsm_state *fsm; |
1009 | struct fsm *net; |
1010 | struct trans *t; |
1011 | int i, j, linecount; |
1012 | |
1013 | if (g_verbose) |
1014 | { |
1015 | fprintf(stderr,"Building lexicon...\n"); |
1016 | fflush(stderr); |
1017 | } |
1018 | lexc_merge_states(); |
1019 | net = fsm_create(""); |
1020 | free(net->sigma); |
1021 | net->sigma = lexsigma; |
1022 | lexc_number_states(); |
1023 | if (hasfinal == 0) { |
1024 | if (g_verbose) |
1025 | { |
1026 | fprintf(stderr,"Warning: # is never reached!!!\n"); |
1027 | fflush(stderr); |
1028 | } |
1029 | return(fsm_empty_set()); |
1030 | } |
1031 | sa = malloc(sizeof(struct statelist)*lexc_statecount); |
1032 | for (s = statelist; s != NULL; s = s->next) { |
1033 | sa[s->state->number].state = s->state; |
1034 | sa[s->state->number].start = s->start; |
1035 | sa[s->state->number].final = s->final; |
1036 | } |
1037 | linecount = 0; |
1038 | for (s = statelist; s != NULL; s = s->next) { |
1039 | linecount++; |
1040 | for (t = s->state->trans; t != NULL; t = t->next) |
1041 | linecount++; |
1042 | } |
1043 | fsm = malloc(sizeof(struct fsm_state)*(linecount+1)); |
1044 | for (i = 0, j = 0, s = sa; j < lexc_statecount; j++) { |
1045 | if (s[j].state->trans == NULL) { |
1046 | add_fsm_arc(fsm,i,s[j].state->number, -1, -1, -1, s[j].final, s[j].start); |
1047 | i++; |
1048 | } else { |
1049 | for (t = s[j].state->trans; t != NULL; t = t->next) { |
1050 | add_fsm_arc(fsm,i,s[j].state->number,t->in,t->out,t->target->number,s[j].final,s[j].start); |
1051 | i++; |
1052 | } |
1053 | } |
1054 | } |
1055 | add_fsm_arc(fsm, i, -1, -1, -1, -1, -1, -1); |
1056 | net->states = fsm; |
1057 | net->statecount = lexc_statecount; |
1058 | fsm_update_flags(net, UNK, UNK, UNK, UNK, UNK, UNK); |
1059 | if (sigma_find_number(EPSILON, lexsigma) == -1) |
1060 | sigma_add_special(EPSILON, lexsigma); |
1061 | free(s); |
1062 | lexc_cleanup(); |
1063 | sigma_cleanup(net,0); |
1064 | sigma_sort(net); |
1065 | |
1066 | if (g_verbose) |
1067 | { |
1068 | fprintf(stderr,"Determinizing...\n"); |
1069 | fflush(stderr); |
1070 | } |
1071 | net = fsm_determinize(net); |
1072 | if (g_verbose) |
1073 | { |
1074 | fprintf(stderr,"Minimizing...\n"); |
1075 | fflush(stderr); |
1076 | } |
1077 | net = fsm_topsort(fsm_minimize(net)); |
1078 | if (g_verbose) |
1079 | { |
1080 | fprintf(stderr,"Done!\n"); |
1081 | fflush(stderr); |
1082 | } |
1083 | return(net); |
1084 | } |
1085 | |
1086 | void lexc_cleanup() { |
1087 | struct lexstates *l, *ln; |
1088 | struct statelist *s, *sn; |
1089 | struct trans *t, *tn; |
1090 | struct multichar_symbols *mcs, *mcsn; |
1091 | struct lexc_hashtable *lhash, *lprev; |
1092 | int i; |
1093 | free(mchash); |
1094 | for (i=0; i < SIGMA_HASH_TABLESIZE; i++) { |
1095 | for (lhash = hashtable+i; lhash != NULL; ) { |
1096 | if (lhash->symbol != NULL) { |
1097 | free(lhash->symbol); |
1098 | } |
1099 | lprev = lhash; |
1100 | lhash = lhash->next; |
1101 | if (lprev != hashtable+i) { free(lprev); } |
1102 | } |
1103 | } |
1104 | free(hashtable); |
1105 | for (mcs = mc ; mcs != NULL ; mcs = mcsn) { |
1106 | mcsn = mcs->next; |
1107 | free(mcs->symbol); |
1108 | free(mcs); |
1109 | } |
1110 | for (l = lexstates ; l != NULL ; l = ln) { |
1111 | ln = l->next; |
1112 | free(l->name); |
1113 | free(l); |
1114 | } |
1115 | for (s = statelist; s != NULL; s = s->next) { |
1116 | for (t = s->state->trans; t != NULL; t = tn) { |
1117 | tn = t->next; |
1118 | free(t); |
1119 | } |
1120 | free(s->state); |
1121 | } |
1122 | for (s = statelist; s != NULL; s = sn) { |
1123 | sn = s->next; |
1124 | free(s); |
1125 | } |
1126 | } |