File: | spelling.c |
Warning: | line 416, column 28 The left operand of '==' is a garbage value |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* Foma: a finite-state toolkit and library. */ | |||
2 | /* Copyright © 2008-2021 Mans Hulden */ | |||
3 | ||||
4 | /* This file is part of foma. */ | |||
5 | ||||
6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ | |||
7 | /* you may not use this file except in compliance with the License. */ | |||
8 | /* You may obtain a copy of the License at */ | |||
9 | ||||
10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ | |||
11 | ||||
12 | /* Unless required by applicable law or agreed to in writing, software */ | |||
13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ | |||
14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ | |||
15 | /* See the License for the specific language governing permissions and */ | |||
16 | /* limitations under the License. */ | |||
17 | ||||
18 | #include <stdio.h> | |||
19 | #include <stdlib.h> | |||
20 | #include <string.h> | |||
21 | #include <limits.h> | |||
22 | #include "foma.h" | |||
23 | ||||
24 | #define INITIAL_AGENDA_SIZE256 256 | |||
25 | #define INITIAL_HEAP_SIZE256 256 | |||
26 | #define INITIAL_STRING_SIZE256 256 | |||
27 | #define MED_DEFAULT_LIMIT4 4 /* Default max words to find */ | |||
28 | #define MED_DEFAULT_CUTOFF15 15 /* Default MED cost cutoff */ | |||
29 | #define MED_DEFAULT_MAX_HEAP_SIZE262145 262145 /* By default won't grow heap more than this */ | |||
30 | ||||
31 | #define BITMASK(b)(1 << ((b) & 7)) (1 << ((b) & 7)) | |||
32 | #define BITSLOT(b)((b) >> 3) ((b) >> 3) | |||
33 | #define BITSET(a,b)((a)[((b) >> 3)] |= (1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] |= BITMASK(b)(1 << ((b) & 7))) | |||
34 | #define BITCLEAR(a,b)((a)[((b) >> 3)] &= ~(1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] &= ~BITMASK(b)(1 << ((b) & 7))) | |||
35 | #define BITTEST(a,b)((a)[((b) >> 3)] & (1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] & BITMASK(b)(1 << ((b) & 7))) | |||
36 | #define BITNSLOTS(nb)((nb + 8 - 1) / 8) ((nb + CHAR_BIT8 - 1) / CHAR_BIT8) | |||
37 | #define min_(X, Y)((X) < (Y) ? (X) : (Y)) ((X) < (Y) ? (X) : (Y)) | |||
38 | ||||
39 | static int calculate_h(struct apply_med_handle *medh, int *intword, int currpos, int state); | |||
40 | static struct astarnode *node_delete_min(struct apply_med_handle *medh); | |||
41 | int node_insert(struct apply_med_handle *medh, int wordpos, int fsmstate, int g, int h, int in, int out, int parent); | |||
42 | ||||
43 | char *print_sym(int sym, struct sigma *sigma) { | |||
44 | while (sigma != NULL((void*)0)) { | |||
45 | if (sigma->number == sym) { | |||
46 | return(sigma->symbol); | |||
47 | } | |||
48 | sigma = sigma->next; | |||
49 | } | |||
50 | return NULL((void*)0); | |||
51 | } | |||
52 | ||||
53 | void print_match(struct apply_med_handle *medh, struct astarnode *node, struct sigma *sigma, char *word) { | |||
54 | int sym, i, wordlen , printptr; | |||
55 | struct astarnode *n; | |||
56 | int_stack_clear(); | |||
57 | wordlen = medh->wordlen; | |||
58 | for (n = node; n != NULL((void*)0) ; n = medh->agenda+(n->parent)) { | |||
59 | if (n->in == 0 && n->out == 0) | |||
60 | break; | |||
61 | if (n->parent == -1) | |||
62 | break; | |||
63 | int_stack_push(n->in); | |||
64 | } | |||
65 | printptr = 0; | |||
66 | if (medh->outstring_length < 2*wordlen) { | |||
67 | medh->outstring_length *= 2; | |||
68 | medh->outstring = realloc(medh->outstring, medh->outstring_length*sizeof(char)); | |||
69 | } | |||
70 | while (!(int_stack_isempty())) { | |||
71 | sym = int_stack_pop(); | |||
72 | if (sym > 2) { | |||
73 | printptr += sprintf(medh->outstring+printptr,"%s", print_sym(sym, sigma)); | |||
74 | } | |||
75 | if (sym == 0) { | |||
76 | if (medh->align_symbol) { | |||
77 | printptr += sprintf(medh->outstring+printptr,"%s",medh->align_symbol); | |||
78 | } | |||
79 | } | |||
80 | if (sym == 2) { | |||
81 | printptr += sprintf(medh->outstring+printptr,"@"); | |||
82 | } | |||
83 | } | |||
84 | for (n = node; n != NULL((void*)0) ; n = medh->agenda+(n->parent)) { | |||
85 | if (n->in == 0 && n->out == 0) | |||
86 | break; | |||
87 | if (n->parent == -1) | |||
88 | break; | |||
89 | else | |||
90 | int_stack_push(n->out); | |||
91 | } | |||
92 | printptr = 0; | |||
93 | if (medh->instring_length < 2*wordlen) { | |||
94 | medh->instring_length *= 2; | |||
95 | medh->instring = realloc(medh->instring, medh->instring_length*sizeof(char)); | |||
96 | } | |||
97 | for (i = 0; !(int_stack_isempty()); ) { | |||
98 | sym = int_stack_pop(); | |||
99 | if (sym > 2) { | |||
100 | printptr += sprintf(medh->instring+printptr,"%s", print_sym(sym, sigma)); | |||
101 | i += utf8skip(word+i)+1; | |||
102 | } | |||
103 | if (sym == 0) { | |||
104 | if (medh->align_symbol) { | |||
105 | printptr += sprintf(medh->instring+printptr,"%s",medh->align_symbol); | |||
106 | } | |||
107 | } | |||
108 | if (sym == 2) { | |||
109 | if (i > wordlen) { | |||
110 | printptr += sprintf(medh->instring+printptr,"*"); | |||
111 | } else { | |||
112 | //printf("%.*s", utf8skip(word+i)+1, word+i); | |||
113 | printptr += sprintf(medh->instring+printptr,"%.*s", utf8skip(word+i)+1, word+i); | |||
114 | i+= utf8skip(word+i)+1; | |||
115 | } | |||
116 | } | |||
117 | } | |||
118 | medh->cost = node->g; | |||
119 | // printf("Cost[f]: %i\n\n", node->g); | |||
120 | } | |||
121 | ||||
122 | void apply_med_clear(struct apply_med_handle *medh) { | |||
123 | if (medh == NULL((void*)0)) | |||
124 | return; | |||
125 | if (medh->agenda != NULL((void*)0)) | |||
126 | free(medh->agenda); | |||
127 | if (medh->instring != NULL((void*)0)) | |||
128 | free(medh->instring); | |||
129 | if (medh->outstring != NULL((void*)0)) | |||
130 | free(medh->outstring); | |||
131 | if (medh->heap != NULL((void*)0)) | |||
132 | free(medh->heap); | |||
133 | if (medh->state_array != NULL((void*)0)) | |||
134 | free(medh->state_array); | |||
135 | if (medh->align_symbol != NULL((void*)0)) | |||
136 | free(medh->align_symbol); | |||
137 | if (medh->letterbits != NULL((void*)0)) | |||
138 | free(medh->letterbits); | |||
139 | if (medh->nletterbits != NULL((void*)0)) | |||
140 | free(medh->nletterbits); | |||
141 | if (medh->intword != NULL((void*)0)) | |||
142 | free(medh->intword); | |||
143 | if (medh->sigmahash != NULL((void*)0)) | |||
144 | sh_done(medh->sigmahash); | |||
145 | free(medh); | |||
146 | } | |||
147 | ||||
148 | struct apply_med_handle *apply_med_init(struct fsm *net) { | |||
149 | ||||
150 | struct apply_med_handle *medh; | |||
151 | struct sigma *sigma; | |||
152 | medh = calloc(1,sizeof(struct apply_med_handle)); | |||
153 | medh->net = net; | |||
154 | medh->agenda = malloc(sizeof(struct astarnode)*INITIAL_AGENDA_SIZE256); | |||
155 | medh->agenda->f = -1; | |||
156 | medh->agenda_size = INITIAL_AGENDA_SIZE256; | |||
157 | ||||
158 | medh->heap = malloc(sizeof(int)*INITIAL_HEAP_SIZE256); | |||
159 | medh->heap_size = INITIAL_HEAP_SIZE256; | |||
160 | *(medh->heap) = 0; /* Points to sentinel */ | |||
161 | medh->astarcount = 1; | |||
162 | medh->heapcount = 0; | |||
163 | medh->state_array = map_firstlines(net); | |||
164 | if (net->medlookup != NULL((void*)0) && net->medlookup->confusion_matrix != NULL((void*)0)) { | |||
165 | medh->hascm = 1; | |||
166 | medh->cm = net->medlookup->confusion_matrix; | |||
167 | } | |||
168 | medh->maxsigma = sigma_max(net->sigma)+1; | |||
169 | medh->sigmahash = sh_init(); | |||
170 | for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1 ; sigma=sigma->next ) { | |||
171 | if (sigma->number > IDENTITY2) { | |||
172 | sh_add_string(medh->sigmahash, sigma->symbol, sigma->number); | |||
173 | } | |||
174 | } | |||
175 | ||||
176 | ||||
177 | fsm_create_letter_lookup(medh, net); | |||
178 | ||||
179 | medh->instring = malloc(sizeof(char)*INITIAL_STRING_SIZE256); | |||
180 | medh->instring_length = INITIAL_STRING_SIZE256; | |||
181 | medh->outstring = malloc(sizeof(char)*INITIAL_STRING_SIZE256); | |||
182 | medh->outstring_length = INITIAL_STRING_SIZE256; | |||
183 | ||||
184 | medh->med_limit = MED_DEFAULT_LIMIT4; | |||
185 | medh->med_cutoff = MED_DEFAULT_CUTOFF15; | |||
186 | medh->med_max_heap_size = MED_DEFAULT_MAX_HEAP_SIZE262145; | |||
187 | return(medh); | |||
188 | } | |||
189 | ||||
190 | void apply_med_set_heap_max(struct apply_med_handle *medh, int max) { | |||
191 | if (medh != NULL((void*)0)) { | |||
192 | medh->med_max_heap_size = max; | |||
193 | } | |||
194 | } | |||
195 | ||||
196 | void apply_med_set_align_symbol(struct apply_med_handle *medh, char *align) { | |||
197 | if (medh != NULL((void*)0)) { | |||
198 | medh->align_symbol = strdup(align); | |||
199 | } | |||
200 | } | |||
201 | ||||
202 | void apply_med_set_med_limit(struct apply_med_handle *medh, int max) { | |||
203 | if (medh != NULL((void*)0)) { | |||
204 | medh->med_limit = max; | |||
205 | } | |||
206 | } | |||
207 | ||||
208 | void apply_med_set_med_cutoff(struct apply_med_handle *medh, int max) { | |||
209 | if (medh != NULL((void*)0)) { | |||
210 | medh->med_cutoff = max; | |||
211 | } | |||
212 | } | |||
213 | ||||
214 | int apply_med_get_cost(struct apply_med_handle *medh) { | |||
215 | return(medh->cost); | |||
216 | } | |||
217 | ||||
218 | char *apply_med_get_instring(struct apply_med_handle *medh) { | |||
219 | return(medh->instring); | |||
220 | } | |||
221 | ||||
222 | char *apply_med_get_outstring(struct apply_med_handle *medh) { | |||
223 | return(medh->outstring); | |||
224 | } | |||
225 | ||||
226 | char *apply_med(struct apply_med_handle *medh, char *word) { | |||
227 | ||||
228 | /* local ok: i, j, target, in, out, g, h, curr_node */ | |||
229 | /* not ok: curr_ptr, curr_pos, lines, nummatches, nodes_expanded, curr_state */ | |||
230 | ||||
231 | int i, j, target, in, out, g, h, thisskip; | |||
232 | ||||
233 | int delcost, subscost, inscost; | |||
234 | char temputf[5] ; | |||
235 | ||||
236 | struct astarnode *curr_node; | |||
237 | ||||
238 | delcost = subscost = inscost = 1; | |||
239 | ||||
240 | ||||
241 | if (word == NULL((void*)0)) { | |||
| ||||
242 | goto resume; | |||
243 | } | |||
244 | ||||
245 | medh->word = word; | |||
246 | ||||
247 | medh->nodes_expanded = 0; | |||
248 | medh->astarcount = 1; | |||
249 | medh->heapcount = 0; | |||
250 | ||||
251 | medh->wordlen = strlen(word); | |||
252 | medh->utf8len = utf8strlen(word); | |||
253 | if (medh->intword != NULL((void*)0)) { | |||
254 | free(medh->intword); | |||
255 | } | |||
256 | medh->intword = malloc(sizeof(int)*(medh->utf8len+1)); | |||
257 | ||||
258 | /* intword -> sigma numbers of word */ | |||
259 | for (i=0, j=0; i < medh->wordlen; i += thisskip, j++) { | |||
260 | thisskip = utf8skip(word+i)+1; | |||
261 | strncpy(temputf, word+i, thisskip); | |||
262 | temputf[thisskip] = '\0'; | |||
263 | if (sh_find_string(medh->sigmahash, temputf) != NULL((void*)0)) { | |||
264 | *(medh->intword+j) = sh_get_value(medh->sigmahash); | |||
265 | } else { | |||
266 | *(medh->intword+j) = IDENTITY2; | |||
267 | } | |||
268 | } | |||
269 | ||||
270 | ||||
271 | ||||
272 | *(medh->intword+j) = -1; /* sentinel */ | |||
273 | ||||
274 | /* Insert (0,0) g = 0 */ | |||
275 | ||||
276 | h = calculate_h(medh, medh->intword, 0, 0); | |||
277 | ||||
278 | /* Root node */ | |||
279 | ||||
280 | if (!node_insert(medh,0,0,0,h,0,0,-1)) { goto out; } | |||
281 | medh->nummatches = 0; | |||
282 | ||||
283 | for(;;) { | |||
284 | ||||
285 | curr_node = node_delete_min(medh); | |||
286 | /* Need to save this in case we realloc and print_match() */ | |||
287 | medh->curr_agenda_offset = curr_node-medh->agenda; | |||
288 | if (curr_node
| |||
289 | //printf("Reached cutoff of %i.\n", medh->med_cutoff); | |||
290 | goto out; | |||
291 | } | |||
292 | medh->curr_state = curr_node->fsmstate; | |||
293 | medh->curr_ptr = (medh->state_array+medh->curr_state)->transitions; | |||
294 | if (!medh->curr_ptr->final_state || !(curr_node->wordpos == medh->utf8len)) { | |||
295 | //continue; | |||
296 | } | |||
297 | ||||
298 | medh->nodes_expanded++; | |||
299 | ||||
300 | if (curr_node->f > medh->med_cutoff) { | |||
301 | //printf("Reached cutoff of %i\n", medh->med_cutoff); | |||
302 | goto out; | |||
303 | } | |||
304 | ||||
305 | medh->curr_pos = curr_node->wordpos; | |||
306 | medh->curr_state = curr_node->fsmstate; | |||
307 | medh->curr_g = curr_node->g; | |||
308 | ||||
309 | medh->lines = 0; | |||
310 | medh->curr_node_has_match = 0; | |||
311 | ||||
312 | for (medh->curr_ptr = (medh->state_array+medh->curr_state)->transitions ; ;) { | |||
313 | if (medh->curr_ptr->state_no == -1) { | |||
314 | break; | |||
315 | } | |||
316 | medh->lines++; | |||
317 | if (medh->curr_ptr->final_state
| |||
318 | if (medh->curr_node_has_match == 0) { | |||
319 | /* Found a match */ | |||
320 | medh->curr_node_has_match = 1; | |||
321 | print_match(medh, medh->agenda+medh->curr_agenda_offset, medh->net->sigma, medh->word); | |||
322 | medh->nummatches++; | |||
323 | return(medh->outstring); | |||
324 | } | |||
325 | } | |||
326 | ||||
327 | resume: | |||
328 | ||||
329 | if (medh->nummatches == medh->med_limit) { | |||
330 | goto out; | |||
331 | } | |||
332 | ||||
333 | if (medh->curr_ptr->target == -1 && medh->curr_pos == medh->utf8len) | |||
334 | break; | |||
335 | if (medh->curr_ptr->target == -1 && medh->lines
| |||
336 | goto insert; | |||
337 | if (medh->curr_ptr->target == -1) | |||
338 | break; | |||
339 | ||||
340 | target = medh->curr_ptr->target; | |||
341 | /* Add nodes to edge:0, edge:input, 0:edge */ | |||
342 | ||||
343 | /* Delete a symbol from input */ | |||
344 | in = medh->curr_ptr->in; | |||
345 | out = 0; | |||
346 | g = medh->hascm ? medh->curr_g + *(medh->cm+in*medh->maxsigma) : medh->curr_g + delcost; | |||
347 | h = calculate_h(medh, medh->intword, medh->curr_pos, medh->curr_ptr->target); | |||
348 | ||||
349 | if ((medh->curr_pos == medh->utf8len) && (medh->curr_ptr->final_state == 0) && (h == 0)) { | |||
350 | // h = 1; | |||
351 | } | |||
352 | ||||
353 | if (g+h <= medh->med_cutoff) { | |||
354 | if (!node_insert(medh, medh->curr_pos, target, g, h, in, out, medh->curr_agenda_offset)) { | |||
355 | goto out; | |||
356 | } | |||
357 | } | |||
358 | if (medh->curr_pos == medh->utf8len) | |||
359 | goto skip; | |||
360 | ||||
361 | /* Match/substitute */ | |||
362 | in = medh->curr_ptr->in; | |||
363 | out = *(medh->intword+medh->curr_pos); | |||
364 | if (in != out) { | |||
365 | g = medh->hascm ? medh->curr_g + *(medh->cm+in*medh->maxsigma+out) : medh->curr_g + subscost; | |||
366 | } else { | |||
367 | g = medh->curr_g; | |||
368 | } | |||
369 | ||||
370 | h = calculate_h(medh, medh->intword, medh->curr_pos+1, medh->curr_ptr->target); | |||
371 | if ((g+h) <= medh->med_cutoff) { | |||
372 | if (!node_insert(medh,medh->curr_pos+1, target, g, h, in, out, medh->curr_agenda_offset)) { | |||
373 | goto out; | |||
374 | } | |||
375 | } | |||
376 | insert: | |||
377 | /* Insert a symbol into input */ | |||
378 | /* Can only be done once per state */ | |||
379 | ||||
380 | if (medh->lines
| |||
381 | ||||
382 | in = 0; | |||
383 | out = *(medh->intword+medh->curr_pos); | |||
384 | ||||
385 | g = medh->hascm ? medh->curr_g + *(medh->cm+out) : medh->curr_g + inscost; | |||
386 | h = calculate_h(medh, medh->intword, medh->curr_pos+1, medh->curr_state); | |||
387 | ||||
388 | if (g+h <= medh->med_cutoff) | |||
389 | if (!node_insert(medh,medh->curr_pos+1, medh->curr_state, g, h, in, out, medh->curr_agenda_offset)) { | |||
390 | goto out; | |||
391 | } | |||
392 | } | |||
393 | if (medh->curr_ptr->target == -1) | |||
394 | break; | |||
395 | skip: | |||
396 | if ((medh->curr_ptr+1)->state_no == (medh->curr_ptr)->state_no) | |||
397 | medh->curr_ptr++; | |||
398 | else | |||
399 | break; | |||
400 | } | |||
401 | } | |||
402 | out: | |||
403 | return(NULL((void*)0)); | |||
404 | } | |||
405 | ||||
406 | int calculate_h(struct apply_med_handle *medh, int *intword, int currpos, int state) { | |||
407 | int i, j, hinf, hn, curr_sym; | |||
408 | uint8_t *bitptr, *nbitptr; | |||
409 | hinf = 0; | |||
410 | hn = 0; | |||
411 | ||||
412 | bitptr = state*medh->bytes_per_letter_array + medh->letterbits; | |||
413 | nbitptr = state*medh->bytes_per_letter_array + medh->nletterbits; | |||
414 | ||||
415 | /* For n = inf */ | |||
416 | if (*(intword+currpos) == -1) | |||
| ||||
417 | return 0; | |||
418 | for (i = currpos; *(intword+i) != -1; i++) { | |||
419 | curr_sym = *(intword+i); | |||
420 | if (!BITTEST(bitptr, curr_sym)((bitptr)[((curr_sym) >> 3)] & (1 << ((curr_sym ) & 7)))) { | |||
421 | hinf++; | |||
422 | } | |||
423 | } | |||
424 | /* For n = maxdepth */ | |||
425 | if (*(intword+currpos) == -1) | |||
426 | return 0; | |||
427 | for (i = currpos, j = 0; j < medh->maxdepth && *(intword+i) != -1; i++, j++) { | |||
428 | curr_sym = *(intword+i); | |||
429 | if (!BITTEST(nbitptr, curr_sym)((nbitptr)[((curr_sym) >> 3)] & (1 << ((curr_sym ) & 7)))) { | |||
430 | hn++; | |||
431 | } | |||
432 | } | |||
433 | return(hinf > hn ? hinf : hn); | |||
434 | } | |||
435 | ||||
436 | struct astarnode *node_delete_min(struct apply_med_handle *medh) { | |||
437 | int i, child; | |||
438 | struct astarnode *firstptr, *lastptr; | |||
439 | if (medh->heapcount == 0) { | |||
440 | return NULL((void*)0); | |||
441 | } | |||
442 | ||||
443 | /* We find the min from the heap */ | |||
444 | ||||
445 | firstptr = medh->agenda+medh->heap[1]; | |||
446 | lastptr = medh->agenda+medh->heap[medh->heapcount]; | |||
447 | medh->heapcount--; | |||
448 | ||||
449 | /* Adjust heap */ | |||
450 | for (i = 1; (i<<1) <= medh->heapcount; i = child) { | |||
451 | child = i<<1; | |||
452 | ||||
453 | /* If right child is smaller (higher priority) than left child */ | |||
454 | if (child != medh->heapcount && | |||
455 | ((medh->agenda+medh->heap[child+1])->f < (medh->agenda+medh->heap[child])->f || | |||
456 | ((medh->agenda+medh->heap[child+1])->f <= (medh->agenda+medh->heap[child])->f && | |||
457 | (medh->agenda+medh->heap[child+1])->wordpos > (medh->agenda+medh->heap[child])->wordpos))) { | |||
458 | child++; | |||
459 | } | |||
460 | ||||
461 | /* If child has lower priority than last element */ | |||
462 | if ((medh->agenda+medh->heap[child])->f < lastptr->f || | |||
463 | ((medh->agenda+medh->heap[child])->f <= lastptr->f && | |||
464 | (medh->agenda+medh->heap[child])->wordpos > lastptr->wordpos)) { | |||
465 | ||||
466 | medh->heap[i] = medh->heap[child]; | |||
467 | } else { | |||
468 | break; | |||
469 | } | |||
470 | } | |||
471 | medh->heap[i] = (lastptr-medh->agenda); | |||
472 | return(firstptr); | |||
473 | } | |||
474 | ||||
475 | int node_insert(struct apply_med_handle *medh, int wordpos, int fsmstate, int g, int h, int in, int out, int parent) { | |||
476 | int i,j,f; | |||
477 | /* We add the node in the array */ | |||
478 | i = medh->astarcount; | |||
479 | if (i >= (medh->agenda_size-1)) { | |||
480 | if (medh->agenda_size*2 >= medh->med_max_heap_size) { | |||
481 | //printf("heap limit reached by %i\n", medh->med_max_heap_size); | |||
482 | return 0; | |||
483 | } | |||
484 | medh->agenda_size *= 2; | |||
485 | medh->agenda = realloc(medh->agenda, sizeof(struct astarnode)*medh->agenda_size); | |||
486 | } | |||
487 | f = g + h; | |||
488 | (medh->agenda+i)->wordpos = wordpos; | |||
489 | (medh->agenda+i)->fsmstate = fsmstate; | |||
490 | (medh->agenda+i)->f = f; | |||
491 | (medh->agenda+i)->g = g; | |||
492 | (medh->agenda+i)->h = h; | |||
493 | (medh->agenda+i)->in = in; | |||
494 | (medh->agenda+i)->out = out; | |||
495 | (medh->agenda+i)->parent = parent; | |||
496 | medh->astarcount++; | |||
497 | ||||
498 | /* We also put the ptr on the heap */ | |||
499 | medh->heapcount++; | |||
500 | ||||
501 | if (medh->heapcount == medh->heap_size-1) { | |||
502 | medh->heap = realloc(medh->heap, sizeof(int)*medh->heap_size*2); | |||
503 | medh->heap_size *= 2; | |||
504 | } | |||
505 | /* >= makes fifo */ | |||
506 | // for (j = heapcount; (agenda+heap[j/2])->f > f; j /= 2) { | |||
507 | for (j = medh->heapcount; ( (medh->agenda+medh->heap[j>>1])->f > f) || ((medh->agenda+medh->heap[j>>1])->f >= f && (medh->agenda+medh->heap[j>>1])->wordpos <= wordpos) ; j = j >> 1) { | |||
508 | medh->heap[j] = medh->heap[j>>1]; | |||
509 | } | |||
510 | medh->heap[j] = i; | |||
511 | return 1; | |||
512 | } | |||
513 | ||||
514 | ||||
515 | void letterbits_union(int v, int vp, uint8_t *ptr, int bytes_per_letter_array) { | |||
516 | int i; | |||
517 | uint8_t *vptr, *vpptr; | |||
518 | vptr = ptr+(v*bytes_per_letter_array); | |||
519 | vpptr = ptr+(vp*bytes_per_letter_array); | |||
520 | for (i=0; i < bytes_per_letter_array; i++) { | |||
521 | *(vptr+i) = *(vptr+i)|*(vpptr+i); | |||
522 | } | |||
523 | } | |||
524 | ||||
525 | void letterbits_copy(int source, int target, uint8_t *ptr, int bytes_per_letter_array) { | |||
526 | int i; | |||
527 | uint8_t *sourceptr, *targetptr; | |||
528 | sourceptr = ptr+(source*bytes_per_letter_array); | |||
529 | targetptr = ptr+(target*bytes_per_letter_array); | |||
530 | for (i=0; i < bytes_per_letter_array; i++) { | |||
531 | *(targetptr+i) = *(sourceptr+i); | |||
532 | } | |||
533 | } | |||
534 | ||||
535 | void letterbits_add(int v, int symbol, uint8_t *ptr, int bytes_per_letter_array) { | |||
536 | uint8_t *vptr; | |||
537 | vptr = ptr+(v*bytes_per_letter_array); | |||
538 | BITSET(vptr, symbol)((vptr)[((symbol) >> 3)] |= (1 << ((symbol) & 7))); | |||
539 | } | |||
540 | ||||
541 | /* Creates, for each state, a list of symbols that can be matched */ | |||
542 | /* somewhere in subsequent paths (the case n = inf) */ | |||
543 | /* This is needed for h() of A*-search in approximate matching */ | |||
544 | /* This is done by a DFS where each state gets the properties */ | |||
545 | /* of the arcs it has as well as a copy of the properties of the target state */ | |||
546 | /* At the same time we keep track of the strongly connected components */ | |||
547 | /* And copy the properties from each root of the SCC to the children */ | |||
548 | /* The SCC part is required for the algorithm to work with cyclic graphs */ | |||
549 | /* This is basically a modification of Tarjan's (1972) SCC algorithm */ | |||
550 | /* However, it's converted from recursive form to iterative form using */ | |||
551 | /* goto statements. Here's the original recursive specification: */ | |||
552 | ||||
553 | /* Input: FSM = (V,E) */ | |||
554 | /* Output: bitvector v.letters for each state */ | |||
555 | /* index = 1 DFS node number counter */ | |||
556 | ||||
557 | /* procedure Mark(v) */ | |||
558 | /* v.index = index Set the depth index for v */ | |||
559 | /* v.lowlink = index */ | |||
560 | /* index++ */ | |||
561 | /* push(v) Push v on the stack */ | |||
562 | /* forall edge = (v, v') in E do Consider successors of v */ | |||
563 | /* v.letters |= v'.letters | edge Copy target v'.letters */ | |||
564 | /* if (v'.index == 0) Was successor v' visited? */ | |||
565 | /* Mark(v') Recurse */ | |||
566 | /* v.lowlink = min(v.lowlink, v'.lowlink) */ | |||
567 | /* elseif (v' is on stack) Was successor v' in stack S? */ | |||
568 | /* v.lowlink = min(v.lowlink, v'.index) */ | |||
569 | /* if (v.lowlink == v.index) Is v the root of a SCC? */ | |||
570 | /* do */ | |||
571 | /* pop(v') */ | |||
572 | /* v'.letters = v.letters */ | |||
573 | /* while (v' != v) */ | |||
574 | ||||
575 | /* For keeping track of the strongly connected components */ | |||
576 | /* when doing the DFS */ | |||
577 | ||||
578 | struct sccinfo { | |||
579 | int index; | |||
580 | int lowlink; | |||
581 | int on_t_stack; | |||
582 | }; | |||
583 | ||||
584 | void fsm_create_letter_lookup(struct apply_med_handle *medh, struct fsm *net) { | |||
585 | ||||
586 | int num_states, num_symbols, index, v, vp, copystate, i, j; | |||
587 | struct fsm_state *curr_ptr; | |||
588 | struct sccinfo *sccinfo; | |||
589 | int depth; | |||
590 | medh->maxdepth = 2; | |||
591 | ||||
592 | num_states = net->statecount; | |||
593 | num_symbols = sigma_max(net->sigma); | |||
594 | ||||
595 | medh->bytes_per_letter_array = BITNSLOTS(num_symbols+1)((num_symbols+1 + 8 - 1) / 8); | |||
596 | medh->letterbits = calloc(medh->bytes_per_letter_array*num_states,sizeof(uint8_t)); | |||
597 | medh->nletterbits = calloc(medh->bytes_per_letter_array*num_states,sizeof(uint8_t)); | |||
598 | ||||
599 | sccinfo = calloc(num_states,sizeof(struct sccinfo)); | |||
600 | ||||
601 | index = 1; | |||
602 | curr_ptr = net->states; | |||
603 | goto l1; | |||
604 | ||||
605 | /* Here we go again, converting a recursive algorithm to an iterative one */ | |||
606 | /* by gotos */ | |||
607 | ||||
608 | while(!ptr_stack_isempty()) { | |||
609 | ||||
610 | curr_ptr = ptr_stack_pop(); | |||
611 | ||||
612 | v = curr_ptr->state_no; /* source state number */ | |||
613 | vp = curr_ptr->target; /* target state number */ | |||
614 | ||||
615 | /* T: v.letterlist = list_union(v'->list, current edge label) */ | |||
616 | ||||
617 | letterbits_union(v, vp, medh->letterbits,medh->bytes_per_letter_array); /* add v' target bits to v */ | |||
618 | letterbits_add(v, curr_ptr->in, medh->letterbits,medh->bytes_per_letter_array); /* add current arc label to v */ | |||
619 | ||||
620 | (sccinfo+v)->lowlink = min_((sccinfo+v)->lowlink,(sccinfo+vp)->lowlink)(((sccinfo+v)->lowlink) < ((sccinfo+vp)->lowlink) ? ( (sccinfo+v)->lowlink) : ((sccinfo+vp)->lowlink)); | |||
621 | ||||
622 | if ((curr_ptr+1)->state_no != curr_ptr->state_no) { | |||
623 | goto l4; | |||
624 | } else { | |||
625 | goto l3; | |||
626 | } | |||
627 | ||||
628 | l1: | |||
629 | v = curr_ptr->state_no; | |||
630 | vp = curr_ptr->target; /* target */ | |||
631 | /* T: v.lowlink = index, index++, Tpush(v) */ | |||
632 | (sccinfo+v)->index = index; | |||
633 | (sccinfo+v)->lowlink = index; | |||
634 | index++; | |||
635 | int_stack_push(v); | |||
636 | (sccinfo+v)->on_t_stack = 1; | |||
637 | /* if v' not visited (is v'.index set) */ | |||
638 | ||||
639 | /* If done here, check lowlink, pop */ | |||
640 | ||||
641 | if (vp == -1) | |||
642 | goto l4; | |||
643 | l2: | |||
644 | letterbits_add(v, curr_ptr->in, medh->letterbits,medh->bytes_per_letter_array); | |||
645 | if ((sccinfo+vp)->index == 0) { | |||
646 | /* push (v,e) ptr on stack */ | |||
647 | ptr_stack_push(curr_ptr); | |||
648 | curr_ptr = (medh->state_array+(curr_ptr->target))->transitions; | |||
649 | /* (v,e) = (v',firstedge), goto init */ | |||
650 | goto l1; | |||
651 | /* if v' visited */ | |||
652 | /* T: v.lowlink = min(v.lowlink, v'.lowlink), union v.list with e, move to next edge in v, goto loop */ | |||
653 | } else if ((sccinfo+vp)->on_t_stack) { | |||
654 | (sccinfo+v)->lowlink = min_((sccinfo+v)->lowlink,(sccinfo+vp)->lowlink)(((sccinfo+v)->lowlink) < ((sccinfo+vp)->lowlink) ? ( (sccinfo+v)->lowlink) : ((sccinfo+vp)->lowlink)); | |||
655 | } | |||
656 | /* If node is visited, copy its bits */ | |||
657 | letterbits_union(v,vp,medh->letterbits,medh->bytes_per_letter_array); | |||
658 | ||||
659 | l3: | |||
660 | if ((curr_ptr+1)->state_no == curr_ptr->state_no) { | |||
661 | curr_ptr++; | |||
662 | v = curr_ptr->state_no; | |||
663 | vp = curr_ptr->target; /* target */ | |||
664 | goto l2; | |||
665 | } | |||
666 | ||||
667 | ||||
668 | /* T: if lastedge, v.lowlink == v.index, pop Tstack until v is popped and copy v.list to others */ | |||
669 | /* Copy all bits from root of SCC to descendants */ | |||
670 | l4: | |||
671 | if ((sccinfo+v)->lowlink == (sccinfo+v)->index) { | |||
672 | //printf("\nSCC: [%i] ",v); | |||
673 | while((copystate = int_stack_pop()) != v) { | |||
674 | (sccinfo+copystate)->on_t_stack = 0; | |||
675 | letterbits_copy(v, copystate, medh->letterbits, medh->bytes_per_letter_array); | |||
676 | //printf("%i ", copystate); | |||
677 | } | |||
678 | (sccinfo+v)->on_t_stack = 0; | |||
679 | //printf("\n"); | |||
680 | } | |||
681 | } | |||
682 | ||||
683 | for (i=0; i < num_states; i++) { | |||
684 | //printf("State %i: ",i); | |||
685 | for (j=0; j <= num_symbols; j++) { | |||
686 | if (BITTEST(medh->letterbits+(i*medh->bytes_per_letter_array),j)((medh->letterbits+(i*medh->bytes_per_letter_array))[(( j) >> 3)] & (1 << ((j) & 7)))) { | |||
687 | //printf("[%i]",j); | |||
688 | } | |||
689 | } | |||
690 | //printf("\n"); | |||
691 | } | |||
692 | int_stack_clear(); | |||
693 | ||||
694 | /* We do the same thing for some finite n (up to maxdepth) */ | |||
695 | /* and store the result in nletterbits */ | |||
696 | ||||
697 | for (v=0; v < num_states; v++) { | |||
698 | ptr_stack_push((medh->state_array+v)->transitions); | |||
699 | int_stack_push(0); | |||
700 | while (!ptr_stack_isempty()) { | |||
701 | curr_ptr = ptr_stack_pop(); | |||
702 | depth = int_stack_pop(); | |||
703 | looper: | |||
704 | if (depth == medh->maxdepth) | |||
705 | continue; | |||
706 | if (curr_ptr->in != -1) { | |||
707 | letterbits_add(v, curr_ptr->in, medh->nletterbits, medh->bytes_per_letter_array); | |||
708 | } | |||
709 | if (curr_ptr->target != -1) { | |||
710 | if (curr_ptr->state_no == (curr_ptr+1)->state_no) { | |||
711 | ptr_stack_push(curr_ptr+1); | |||
712 | int_stack_push(depth); | |||
713 | } | |||
714 | depth++; | |||
715 | curr_ptr = (medh->state_array+(curr_ptr->target))->transitions; | |||
716 | goto looper; | |||
717 | } | |||
718 | } | |||
719 | } | |||
720 | for (i=0; i < num_states; i++) { | |||
721 | //printf("State %i: ",i); | |||
722 | for (j=0; j <= num_symbols; j++) { | |||
723 | if (BITTEST(medh->nletterbits+(i*medh->bytes_per_letter_array),j)((medh->nletterbits+(i*medh->bytes_per_letter_array))[( (j) >> 3)] & (1 << ((j) & 7)))) { | |||
724 | //printf("[%i]",j); | |||
725 | } | |||
726 | } | |||
727 | //printf("\n"); | |||
728 | } | |||
729 | free(sccinfo); | |||
730 | } | |||
731 | ||||
732 | void cmatrix_print_att(struct fsm *net, FILE *outfile) { | |||
733 | int i, j, *cm, maxsigma; | |||
734 | maxsigma = sigma_max(net->sigma) + 1; | |||
735 | cm = net->medlookup->confusion_matrix; | |||
736 | ||||
737 | ||||
738 | for (i = 0; i < maxsigma ; i++) { | |||
739 | for (j = 0; j < maxsigma ; j++) { | |||
740 | if ((i != 0 && i < 3) || (j != 0 && j < 3)) { continue; } | |||
741 | if (i == 0 && j != 0) { | |||
742 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", "@0@", sigma_string(j, net->sigma), *(cm+i*maxsigma+j)); | |||
743 | } else if (j == 0 && i != 0) { | |||
744 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", sigma_string(i,net->sigma), "@0@", *(cm+i*maxsigma+j)); | |||
745 | } else if (j != 0 && i != 0) { | |||
746 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", sigma_string(i,net->sigma),sigma_string(j, net->sigma), *(cm+i*maxsigma+j)); | |||
747 | } | |||
748 | } | |||
749 | } | |||
750 | fprintf(outfile,"0\n"); | |||
751 | } | |||
752 | ||||
753 | void cmatrix_print(struct fsm *net) { | |||
754 | int lsymbol, i, j, *cm, maxsigma; | |||
755 | char *thisstring; | |||
756 | struct sigma *sigma; | |||
757 | maxsigma = sigma_max(net->sigma) + 1; | |||
758 | cm = net->medlookup->confusion_matrix; | |||
759 | ||||
760 | lsymbol = 0 ; | |||
761 | for (sigma = net->sigma; sigma != NULL((void*)0); sigma = sigma->next) { | |||
762 | if (sigma->number < 3) | |||
763 | continue; | |||
764 | lsymbol = strlen(sigma->symbol) > lsymbol ? strlen(sigma->symbol) : lsymbol; | |||
765 | } | |||
766 | printf("%*s",lsymbol+2,""); | |||
767 | printf("%s","0 "); | |||
768 | ||||
769 | for (i = 3; ; i++) { | |||
770 | if ((thisstring = sigma_string(i, net->sigma)) != NULL((void*)0)) { | |||
771 | printf("%s ",thisstring); | |||
772 | } else { | |||
773 | break; | |||
774 | } | |||
775 | } | |||
776 | ||||
777 | printf("\n"); | |||
778 | ||||
779 | for (i = 0; i < maxsigma ; i++) { | |||
780 | for (j = 0; j < maxsigma ; j++) { | |||
781 | if (j == 0) { | |||
782 | if (i == 0) { | |||
783 | printf("%*s",lsymbol+1,"0"); | |||
784 | printf("%*s",2,"*"); | |||
785 | } else { | |||
786 | printf("%*s",lsymbol+1, sigma_string(i, net->sigma)); | |||
787 | printf("%*d",2,*(cm+i*maxsigma+j)); | |||
788 | } | |||
789 | j++; | |||
790 | j++; | |||
791 | continue; | |||
792 | } | |||
793 | if (i == j) { | |||
794 | printf("%.*s",(int)strlen(sigma_string(j, net->sigma))+1,"*"); | |||
795 | } else { | |||
796 | printf("%.*d",(int)strlen(sigma_string(j, net->sigma))+1,*(cm+i*maxsigma+j)); | |||
797 | } | |||
798 | } | |||
799 | printf("\n"); | |||
800 | if (i == 0) { | |||
801 | i++; i++; | |||
802 | } | |||
803 | } | |||
804 | } | |||
805 | ||||
806 | void cmatrix_init(struct fsm *net) { | |||
807 | int maxsigma, *cm, i, j; | |||
808 | if (net->medlookup == NULL((void*)0)) { | |||
809 | net->medlookup = calloc(1,sizeof(struct medlookup)); | |||
810 | } | |||
811 | maxsigma = sigma_max(net->sigma)+1; | |||
812 | cm = calloc(maxsigma*maxsigma, sizeof(int)); | |||
813 | net->medlookup->confusion_matrix = cm; | |||
814 | for (i = 0; i < maxsigma; i++) { | |||
815 | for (j = 0; j < maxsigma; j++) { | |||
816 | if (i == j) | |||
817 | *(cm+i*maxsigma+j) = 0; | |||
818 | else | |||
819 | *(cm+i*maxsigma+j) = 1; | |||
820 | } | |||
821 | } | |||
822 | } | |||
823 | ||||
824 | void cmatrix_default_substitute(struct fsm *net, int cost) { | |||
825 | int i, j, maxsigma, *cm; | |||
826 | cm = net->medlookup->confusion_matrix; | |||
827 | maxsigma = sigma_max(net->sigma)+1; | |||
828 | for (i = 1; i < maxsigma; i++) { | |||
829 | for (j = 1; j < maxsigma; j++) { | |||
830 | if (i == j) { | |||
831 | *(cm+i*maxsigma+j) = 0; | |||
832 | } else { | |||
833 | *(cm+i*maxsigma+j) = cost; | |||
834 | } | |||
835 | } | |||
836 | } | |||
837 | } | |||
838 | ||||
839 | void cmatrix_default_insert(struct fsm *net, int cost) { | |||
840 | int i, maxsigma, *cm; | |||
841 | cm = net->medlookup->confusion_matrix; | |||
842 | maxsigma = sigma_max(net->sigma)+1; | |||
843 | for (i = 0; i < maxsigma; i++) { | |||
844 | *(cm+i) = cost; | |||
845 | } | |||
846 | } | |||
847 | ||||
848 | void cmatrix_default_delete(struct fsm *net, int cost) { | |||
849 | int i, maxsigma, *cm; | |||
850 | cm = net->medlookup->confusion_matrix; | |||
851 | maxsigma = sigma_max(net->sigma)+1; | |||
852 | for (i = 0; i < maxsigma; i++) { | |||
853 | *(cm+i*maxsigma) = cost; | |||
854 | } | |||
855 | } | |||
856 | ||||
857 | void cmatrix_set_cost(struct fsm *net, char *in, char *out, int cost) { | |||
858 | int i, o, maxsigma, *cm; | |||
859 | cm = net->medlookup->confusion_matrix; | |||
860 | maxsigma = sigma_max(net->sigma) + 1; | |||
861 | if (in == NULL((void*)0)) { | |||
862 | i = 0; | |||
863 | } else { | |||
864 | i = sigma_find(in, net->sigma); | |||
865 | } | |||
866 | if (out == NULL((void*)0)) { | |||
867 | o = 0; | |||
868 | } else { | |||
869 | o = sigma_find(out, net->sigma); | |||
870 | } | |||
871 | if (i == -1) { | |||
872 | printf("Warning, symbol '%s' not in alphabet\n",in); | |||
873 | return; | |||
874 | } | |||
875 | if (o == -1) { | |||
876 | printf("Warning, symbol '%s' not in alphabet\n",out); | |||
877 | return; | |||
878 | } | |||
879 | *(cm+i*maxsigma+o) = cost; | |||
880 | } |