| File: | spelling.c |
| Warning: | line 416, column 28 The left operand of '==' is a garbage value |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* Foma: a finite-state toolkit and library. */ | |||
| 2 | /* Copyright © 2008-2021 Mans Hulden */ | |||
| 3 | ||||
| 4 | /* This file is part of foma. */ | |||
| 5 | ||||
| 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ | |||
| 7 | /* you may not use this file except in compliance with the License. */ | |||
| 8 | /* You may obtain a copy of the License at */ | |||
| 9 | ||||
| 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ | |||
| 11 | ||||
| 12 | /* Unless required by applicable law or agreed to in writing, software */ | |||
| 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ | |||
| 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ | |||
| 15 | /* See the License for the specific language governing permissions and */ | |||
| 16 | /* limitations under the License. */ | |||
| 17 | ||||
| 18 | #include <stdio.h> | |||
| 19 | #include <stdlib.h> | |||
| 20 | #include <string.h> | |||
| 21 | #include <limits.h> | |||
| 22 | #include "foma.h" | |||
| 23 | ||||
| 24 | #define INITIAL_AGENDA_SIZE256 256 | |||
| 25 | #define INITIAL_HEAP_SIZE256 256 | |||
| 26 | #define INITIAL_STRING_SIZE256 256 | |||
| 27 | #define MED_DEFAULT_LIMIT4 4 /* Default max words to find */ | |||
| 28 | #define MED_DEFAULT_CUTOFF15 15 /* Default MED cost cutoff */ | |||
| 29 | #define MED_DEFAULT_MAX_HEAP_SIZE262145 262145 /* By default won't grow heap more than this */ | |||
| 30 | ||||
| 31 | #define BITMASK(b)(1 << ((b) & 7)) (1 << ((b) & 7)) | |||
| 32 | #define BITSLOT(b)((b) >> 3) ((b) >> 3) | |||
| 33 | #define BITSET(a,b)((a)[((b) >> 3)] |= (1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] |= BITMASK(b)(1 << ((b) & 7))) | |||
| 34 | #define BITCLEAR(a,b)((a)[((b) >> 3)] &= ~(1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] &= ~BITMASK(b)(1 << ((b) & 7))) | |||
| 35 | #define BITTEST(a,b)((a)[((b) >> 3)] & (1 << ((b) & 7))) ((a)[BITSLOT(b)((b) >> 3)] & BITMASK(b)(1 << ((b) & 7))) | |||
| 36 | #define BITNSLOTS(nb)((nb + 8 - 1) / 8) ((nb + CHAR_BIT8 - 1) / CHAR_BIT8) | |||
| 37 | #define min_(X, Y)((X) < (Y) ? (X) : (Y)) ((X) < (Y) ? (X) : (Y)) | |||
| 38 | ||||
| 39 | static int calculate_h(struct apply_med_handle *medh, int *intword, int currpos, int state); | |||
| 40 | static struct astarnode *node_delete_min(struct apply_med_handle *medh); | |||
| 41 | int node_insert(struct apply_med_handle *medh, int wordpos, int fsmstate, int g, int h, int in, int out, int parent); | |||
| 42 | ||||
| 43 | char *print_sym(int sym, struct sigma *sigma) { | |||
| 44 | while (sigma != NULL((void*)0)) { | |||
| 45 | if (sigma->number == sym) { | |||
| 46 | return(sigma->symbol); | |||
| 47 | } | |||
| 48 | sigma = sigma->next; | |||
| 49 | } | |||
| 50 | return NULL((void*)0); | |||
| 51 | } | |||
| 52 | ||||
| 53 | void print_match(struct apply_med_handle *medh, struct astarnode *node, struct sigma *sigma, char *word) { | |||
| 54 | int sym, i, wordlen , printptr; | |||
| 55 | struct astarnode *n; | |||
| 56 | int_stack_clear(); | |||
| 57 | wordlen = medh->wordlen; | |||
| 58 | for (n = node; n != NULL((void*)0) ; n = medh->agenda+(n->parent)) { | |||
| 59 | if (n->in == 0 && n->out == 0) | |||
| 60 | break; | |||
| 61 | if (n->parent == -1) | |||
| 62 | break; | |||
| 63 | int_stack_push(n->in); | |||
| 64 | } | |||
| 65 | printptr = 0; | |||
| 66 | if (medh->outstring_length < 2*wordlen) { | |||
| 67 | medh->outstring_length *= 2; | |||
| 68 | medh->outstring = realloc(medh->outstring, medh->outstring_length*sizeof(char)); | |||
| 69 | } | |||
| 70 | while (!(int_stack_isempty())) { | |||
| 71 | sym = int_stack_pop(); | |||
| 72 | if (sym > 2) { | |||
| 73 | printptr += sprintf(medh->outstring+printptr,"%s", print_sym(sym, sigma)); | |||
| 74 | } | |||
| 75 | if (sym == 0) { | |||
| 76 | if (medh->align_symbol) { | |||
| 77 | printptr += sprintf(medh->outstring+printptr,"%s",medh->align_symbol); | |||
| 78 | } | |||
| 79 | } | |||
| 80 | if (sym == 2) { | |||
| 81 | printptr += sprintf(medh->outstring+printptr,"@"); | |||
| 82 | } | |||
| 83 | } | |||
| 84 | for (n = node; n != NULL((void*)0) ; n = medh->agenda+(n->parent)) { | |||
| 85 | if (n->in == 0 && n->out == 0) | |||
| 86 | break; | |||
| 87 | if (n->parent == -1) | |||
| 88 | break; | |||
| 89 | else | |||
| 90 | int_stack_push(n->out); | |||
| 91 | } | |||
| 92 | printptr = 0; | |||
| 93 | if (medh->instring_length < 2*wordlen) { | |||
| 94 | medh->instring_length *= 2; | |||
| 95 | medh->instring = realloc(medh->instring, medh->instring_length*sizeof(char)); | |||
| 96 | } | |||
| 97 | for (i = 0; !(int_stack_isempty()); ) { | |||
| 98 | sym = int_stack_pop(); | |||
| 99 | if (sym > 2) { | |||
| 100 | printptr += sprintf(medh->instring+printptr,"%s", print_sym(sym, sigma)); | |||
| 101 | i += utf8skip(word+i)+1; | |||
| 102 | } | |||
| 103 | if (sym == 0) { | |||
| 104 | if (medh->align_symbol) { | |||
| 105 | printptr += sprintf(medh->instring+printptr,"%s",medh->align_symbol); | |||
| 106 | } | |||
| 107 | } | |||
| 108 | if (sym == 2) { | |||
| 109 | if (i > wordlen) { | |||
| 110 | printptr += sprintf(medh->instring+printptr,"*"); | |||
| 111 | } else { | |||
| 112 | //printf("%.*s", utf8skip(word+i)+1, word+i); | |||
| 113 | printptr += sprintf(medh->instring+printptr,"%.*s", utf8skip(word+i)+1, word+i); | |||
| 114 | i+= utf8skip(word+i)+1; | |||
| 115 | } | |||
| 116 | } | |||
| 117 | } | |||
| 118 | medh->cost = node->g; | |||
| 119 | // printf("Cost[f]: %i\n\n", node->g); | |||
| 120 | } | |||
| 121 | ||||
| 122 | void apply_med_clear(struct apply_med_handle *medh) { | |||
| 123 | if (medh == NULL((void*)0)) | |||
| 124 | return; | |||
| 125 | if (medh->agenda != NULL((void*)0)) | |||
| 126 | free(medh->agenda); | |||
| 127 | if (medh->instring != NULL((void*)0)) | |||
| 128 | free(medh->instring); | |||
| 129 | if (medh->outstring != NULL((void*)0)) | |||
| 130 | free(medh->outstring); | |||
| 131 | if (medh->heap != NULL((void*)0)) | |||
| 132 | free(medh->heap); | |||
| 133 | if (medh->state_array != NULL((void*)0)) | |||
| 134 | free(medh->state_array); | |||
| 135 | if (medh->align_symbol != NULL((void*)0)) | |||
| 136 | free(medh->align_symbol); | |||
| 137 | if (medh->letterbits != NULL((void*)0)) | |||
| 138 | free(medh->letterbits); | |||
| 139 | if (medh->nletterbits != NULL((void*)0)) | |||
| 140 | free(medh->nletterbits); | |||
| 141 | if (medh->intword != NULL((void*)0)) | |||
| 142 | free(medh->intword); | |||
| 143 | if (medh->sigmahash != NULL((void*)0)) | |||
| 144 | sh_done(medh->sigmahash); | |||
| 145 | free(medh); | |||
| 146 | } | |||
| 147 | ||||
| 148 | struct apply_med_handle *apply_med_init(struct fsm *net) { | |||
| 149 | ||||
| 150 | struct apply_med_handle *medh; | |||
| 151 | struct sigma *sigma; | |||
| 152 | medh = calloc(1,sizeof(struct apply_med_handle)); | |||
| 153 | medh->net = net; | |||
| 154 | medh->agenda = malloc(sizeof(struct astarnode)*INITIAL_AGENDA_SIZE256); | |||
| 155 | medh->agenda->f = -1; | |||
| 156 | medh->agenda_size = INITIAL_AGENDA_SIZE256; | |||
| 157 | ||||
| 158 | medh->heap = malloc(sizeof(int)*INITIAL_HEAP_SIZE256); | |||
| 159 | medh->heap_size = INITIAL_HEAP_SIZE256; | |||
| 160 | *(medh->heap) = 0; /* Points to sentinel */ | |||
| 161 | medh->astarcount = 1; | |||
| 162 | medh->heapcount = 0; | |||
| 163 | medh->state_array = map_firstlines(net); | |||
| 164 | if (net->medlookup != NULL((void*)0) && net->medlookup->confusion_matrix != NULL((void*)0)) { | |||
| 165 | medh->hascm = 1; | |||
| 166 | medh->cm = net->medlookup->confusion_matrix; | |||
| 167 | } | |||
| 168 | medh->maxsigma = sigma_max(net->sigma)+1; | |||
| 169 | medh->sigmahash = sh_init(); | |||
| 170 | for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1 ; sigma=sigma->next ) { | |||
| 171 | if (sigma->number > IDENTITY2) { | |||
| 172 | sh_add_string(medh->sigmahash, sigma->symbol, sigma->number); | |||
| 173 | } | |||
| 174 | } | |||
| 175 | ||||
| 176 | ||||
| 177 | fsm_create_letter_lookup(medh, net); | |||
| 178 | ||||
| 179 | medh->instring = malloc(sizeof(char)*INITIAL_STRING_SIZE256); | |||
| 180 | medh->instring_length = INITIAL_STRING_SIZE256; | |||
| 181 | medh->outstring = malloc(sizeof(char)*INITIAL_STRING_SIZE256); | |||
| 182 | medh->outstring_length = INITIAL_STRING_SIZE256; | |||
| 183 | ||||
| 184 | medh->med_limit = MED_DEFAULT_LIMIT4; | |||
| 185 | medh->med_cutoff = MED_DEFAULT_CUTOFF15; | |||
| 186 | medh->med_max_heap_size = MED_DEFAULT_MAX_HEAP_SIZE262145; | |||
| 187 | return(medh); | |||
| 188 | } | |||
| 189 | ||||
| 190 | void apply_med_set_heap_max(struct apply_med_handle *medh, int max) { | |||
| 191 | if (medh != NULL((void*)0)) { | |||
| 192 | medh->med_max_heap_size = max; | |||
| 193 | } | |||
| 194 | } | |||
| 195 | ||||
| 196 | void apply_med_set_align_symbol(struct apply_med_handle *medh, char *align) { | |||
| 197 | if (medh != NULL((void*)0)) { | |||
| 198 | medh->align_symbol = strdup(align); | |||
| 199 | } | |||
| 200 | } | |||
| 201 | ||||
| 202 | void apply_med_set_med_limit(struct apply_med_handle *medh, int max) { | |||
| 203 | if (medh != NULL((void*)0)) { | |||
| 204 | medh->med_limit = max; | |||
| 205 | } | |||
| 206 | } | |||
| 207 | ||||
| 208 | void apply_med_set_med_cutoff(struct apply_med_handle *medh, int max) { | |||
| 209 | if (medh != NULL((void*)0)) { | |||
| 210 | medh->med_cutoff = max; | |||
| 211 | } | |||
| 212 | } | |||
| 213 | ||||
| 214 | int apply_med_get_cost(struct apply_med_handle *medh) { | |||
| 215 | return(medh->cost); | |||
| 216 | } | |||
| 217 | ||||
| 218 | char *apply_med_get_instring(struct apply_med_handle *medh) { | |||
| 219 | return(medh->instring); | |||
| 220 | } | |||
| 221 | ||||
| 222 | char *apply_med_get_outstring(struct apply_med_handle *medh) { | |||
| 223 | return(medh->outstring); | |||
| 224 | } | |||
| 225 | ||||
| 226 | char *apply_med(struct apply_med_handle *medh, char *word) { | |||
| 227 | ||||
| 228 | /* local ok: i, j, target, in, out, g, h, curr_node */ | |||
| 229 | /* not ok: curr_ptr, curr_pos, lines, nummatches, nodes_expanded, curr_state */ | |||
| 230 | ||||
| 231 | int i, j, target, in, out, g, h, thisskip; | |||
| 232 | ||||
| 233 | int delcost, subscost, inscost; | |||
| 234 | char temputf[5] ; | |||
| 235 | ||||
| 236 | struct astarnode *curr_node; | |||
| 237 | ||||
| 238 | delcost = subscost = inscost = 1; | |||
| 239 | ||||
| 240 | ||||
| 241 | if (word == NULL((void*)0)) { | |||
| ||||
| 242 | goto resume; | |||
| 243 | } | |||
| 244 | ||||
| 245 | medh->word = word; | |||
| 246 | ||||
| 247 | medh->nodes_expanded = 0; | |||
| 248 | medh->astarcount = 1; | |||
| 249 | medh->heapcount = 0; | |||
| 250 | ||||
| 251 | medh->wordlen = strlen(word); | |||
| 252 | medh->utf8len = utf8strlen(word); | |||
| 253 | if (medh->intword != NULL((void*)0)) { | |||
| 254 | free(medh->intword); | |||
| 255 | } | |||
| 256 | medh->intword = malloc(sizeof(int)*(medh->utf8len+1)); | |||
| 257 | ||||
| 258 | /* intword -> sigma numbers of word */ | |||
| 259 | for (i=0, j=0; i < medh->wordlen; i += thisskip, j++) { | |||
| 260 | thisskip = utf8skip(word+i)+1; | |||
| 261 | strncpy(temputf, word+i, thisskip); | |||
| 262 | temputf[thisskip] = '\0'; | |||
| 263 | if (sh_find_string(medh->sigmahash, temputf) != NULL((void*)0)) { | |||
| 264 | *(medh->intword+j) = sh_get_value(medh->sigmahash); | |||
| 265 | } else { | |||
| 266 | *(medh->intword+j) = IDENTITY2; | |||
| 267 | } | |||
| 268 | } | |||
| 269 | ||||
| 270 | ||||
| 271 | ||||
| 272 | *(medh->intword+j) = -1; /* sentinel */ | |||
| 273 | ||||
| 274 | /* Insert (0,0) g = 0 */ | |||
| 275 | ||||
| 276 | h = calculate_h(medh, medh->intword, 0, 0); | |||
| 277 | ||||
| 278 | /* Root node */ | |||
| 279 | ||||
| 280 | if (!node_insert(medh,0,0,0,h,0,0,-1)) { goto out; } | |||
| 281 | medh->nummatches = 0; | |||
| 282 | ||||
| 283 | for(;;) { | |||
| 284 | ||||
| 285 | curr_node = node_delete_min(medh); | |||
| 286 | /* Need to save this in case we realloc and print_match() */ | |||
| 287 | medh->curr_agenda_offset = curr_node-medh->agenda; | |||
| 288 | if (curr_node
| |||
| 289 | //printf("Reached cutoff of %i.\n", medh->med_cutoff); | |||
| 290 | goto out; | |||
| 291 | } | |||
| 292 | medh->curr_state = curr_node->fsmstate; | |||
| 293 | medh->curr_ptr = (medh->state_array+medh->curr_state)->transitions; | |||
| 294 | if (!medh->curr_ptr->final_state || !(curr_node->wordpos == medh->utf8len)) { | |||
| 295 | //continue; | |||
| 296 | } | |||
| 297 | ||||
| 298 | medh->nodes_expanded++; | |||
| 299 | ||||
| 300 | if (curr_node->f > medh->med_cutoff) { | |||
| 301 | //printf("Reached cutoff of %i\n", medh->med_cutoff); | |||
| 302 | goto out; | |||
| 303 | } | |||
| 304 | ||||
| 305 | medh->curr_pos = curr_node->wordpos; | |||
| 306 | medh->curr_state = curr_node->fsmstate; | |||
| 307 | medh->curr_g = curr_node->g; | |||
| 308 | ||||
| 309 | medh->lines = 0; | |||
| 310 | medh->curr_node_has_match = 0; | |||
| 311 | ||||
| 312 | for (medh->curr_ptr = (medh->state_array+medh->curr_state)->transitions ; ;) { | |||
| 313 | if (medh->curr_ptr->state_no == -1) { | |||
| 314 | break; | |||
| 315 | } | |||
| 316 | medh->lines++; | |||
| 317 | if (medh->curr_ptr->final_state
| |||
| 318 | if (medh->curr_node_has_match == 0) { | |||
| 319 | /* Found a match */ | |||
| 320 | medh->curr_node_has_match = 1; | |||
| 321 | print_match(medh, medh->agenda+medh->curr_agenda_offset, medh->net->sigma, medh->word); | |||
| 322 | medh->nummatches++; | |||
| 323 | return(medh->outstring); | |||
| 324 | } | |||
| 325 | } | |||
| 326 | ||||
| 327 | resume: | |||
| 328 | ||||
| 329 | if (medh->nummatches == medh->med_limit) { | |||
| 330 | goto out; | |||
| 331 | } | |||
| 332 | ||||
| 333 | if (medh->curr_ptr->target == -1 && medh->curr_pos == medh->utf8len) | |||
| 334 | break; | |||
| 335 | if (medh->curr_ptr->target == -1 && medh->lines
| |||
| 336 | goto insert; | |||
| 337 | if (medh->curr_ptr->target == -1) | |||
| 338 | break; | |||
| 339 | ||||
| 340 | target = medh->curr_ptr->target; | |||
| 341 | /* Add nodes to edge:0, edge:input, 0:edge */ | |||
| 342 | ||||
| 343 | /* Delete a symbol from input */ | |||
| 344 | in = medh->curr_ptr->in; | |||
| 345 | out = 0; | |||
| 346 | g = medh->hascm ? medh->curr_g + *(medh->cm+in*medh->maxsigma) : medh->curr_g + delcost; | |||
| 347 | h = calculate_h(medh, medh->intword, medh->curr_pos, medh->curr_ptr->target); | |||
| 348 | ||||
| 349 | if ((medh->curr_pos == medh->utf8len) && (medh->curr_ptr->final_state == 0) && (h == 0)) { | |||
| 350 | // h = 1; | |||
| 351 | } | |||
| 352 | ||||
| 353 | if (g+h <= medh->med_cutoff) { | |||
| 354 | if (!node_insert(medh, medh->curr_pos, target, g, h, in, out, medh->curr_agenda_offset)) { | |||
| 355 | goto out; | |||
| 356 | } | |||
| 357 | } | |||
| 358 | if (medh->curr_pos == medh->utf8len) | |||
| 359 | goto skip; | |||
| 360 | ||||
| 361 | /* Match/substitute */ | |||
| 362 | in = medh->curr_ptr->in; | |||
| 363 | out = *(medh->intword+medh->curr_pos); | |||
| 364 | if (in != out) { | |||
| 365 | g = medh->hascm ? medh->curr_g + *(medh->cm+in*medh->maxsigma+out) : medh->curr_g + subscost; | |||
| 366 | } else { | |||
| 367 | g = medh->curr_g; | |||
| 368 | } | |||
| 369 | ||||
| 370 | h = calculate_h(medh, medh->intword, medh->curr_pos+1, medh->curr_ptr->target); | |||
| 371 | if ((g+h) <= medh->med_cutoff) { | |||
| 372 | if (!node_insert(medh,medh->curr_pos+1, target, g, h, in, out, medh->curr_agenda_offset)) { | |||
| 373 | goto out; | |||
| 374 | } | |||
| 375 | } | |||
| 376 | insert: | |||
| 377 | /* Insert a symbol into input */ | |||
| 378 | /* Can only be done once per state */ | |||
| 379 | ||||
| 380 | if (medh->lines
| |||
| 381 | ||||
| 382 | in = 0; | |||
| 383 | out = *(medh->intword+medh->curr_pos); | |||
| 384 | ||||
| 385 | g = medh->hascm ? medh->curr_g + *(medh->cm+out) : medh->curr_g + inscost; | |||
| 386 | h = calculate_h(medh, medh->intword, medh->curr_pos+1, medh->curr_state); | |||
| 387 | ||||
| 388 | if (g+h <= medh->med_cutoff) | |||
| 389 | if (!node_insert(medh,medh->curr_pos+1, medh->curr_state, g, h, in, out, medh->curr_agenda_offset)) { | |||
| 390 | goto out; | |||
| 391 | } | |||
| 392 | } | |||
| 393 | if (medh->curr_ptr->target == -1) | |||
| 394 | break; | |||
| 395 | skip: | |||
| 396 | if ((medh->curr_ptr+1)->state_no == (medh->curr_ptr)->state_no) | |||
| 397 | medh->curr_ptr++; | |||
| 398 | else | |||
| 399 | break; | |||
| 400 | } | |||
| 401 | } | |||
| 402 | out: | |||
| 403 | return(NULL((void*)0)); | |||
| 404 | } | |||
| 405 | ||||
| 406 | int calculate_h(struct apply_med_handle *medh, int *intword, int currpos, int state) { | |||
| 407 | int i, j, hinf, hn, curr_sym; | |||
| 408 | uint8_t *bitptr, *nbitptr; | |||
| 409 | hinf = 0; | |||
| 410 | hn = 0; | |||
| 411 | ||||
| 412 | bitptr = state*medh->bytes_per_letter_array + medh->letterbits; | |||
| 413 | nbitptr = state*medh->bytes_per_letter_array + medh->nletterbits; | |||
| 414 | ||||
| 415 | /* For n = inf */ | |||
| 416 | if (*(intword+currpos) == -1) | |||
| ||||
| 417 | return 0; | |||
| 418 | for (i = currpos; *(intword+i) != -1; i++) { | |||
| 419 | curr_sym = *(intword+i); | |||
| 420 | if (!BITTEST(bitptr, curr_sym)((bitptr)[((curr_sym) >> 3)] & (1 << ((curr_sym ) & 7)))) { | |||
| 421 | hinf++; | |||
| 422 | } | |||
| 423 | } | |||
| 424 | /* For n = maxdepth */ | |||
| 425 | if (*(intword+currpos) == -1) | |||
| 426 | return 0; | |||
| 427 | for (i = currpos, j = 0; j < medh->maxdepth && *(intword+i) != -1; i++, j++) { | |||
| 428 | curr_sym = *(intword+i); | |||
| 429 | if (!BITTEST(nbitptr, curr_sym)((nbitptr)[((curr_sym) >> 3)] & (1 << ((curr_sym ) & 7)))) { | |||
| 430 | hn++; | |||
| 431 | } | |||
| 432 | } | |||
| 433 | return(hinf > hn ? hinf : hn); | |||
| 434 | } | |||
| 435 | ||||
| 436 | struct astarnode *node_delete_min(struct apply_med_handle *medh) { | |||
| 437 | int i, child; | |||
| 438 | struct astarnode *firstptr, *lastptr; | |||
| 439 | if (medh->heapcount == 0) { | |||
| 440 | return NULL((void*)0); | |||
| 441 | } | |||
| 442 | ||||
| 443 | /* We find the min from the heap */ | |||
| 444 | ||||
| 445 | firstptr = medh->agenda+medh->heap[1]; | |||
| 446 | lastptr = medh->agenda+medh->heap[medh->heapcount]; | |||
| 447 | medh->heapcount--; | |||
| 448 | ||||
| 449 | /* Adjust heap */ | |||
| 450 | for (i = 1; (i<<1) <= medh->heapcount; i = child) { | |||
| 451 | child = i<<1; | |||
| 452 | ||||
| 453 | /* If right child is smaller (higher priority) than left child */ | |||
| 454 | if (child != medh->heapcount && | |||
| 455 | ((medh->agenda+medh->heap[child+1])->f < (medh->agenda+medh->heap[child])->f || | |||
| 456 | ((medh->agenda+medh->heap[child+1])->f <= (medh->agenda+medh->heap[child])->f && | |||
| 457 | (medh->agenda+medh->heap[child+1])->wordpos > (medh->agenda+medh->heap[child])->wordpos))) { | |||
| 458 | child++; | |||
| 459 | } | |||
| 460 | ||||
| 461 | /* If child has lower priority than last element */ | |||
| 462 | if ((medh->agenda+medh->heap[child])->f < lastptr->f || | |||
| 463 | ((medh->agenda+medh->heap[child])->f <= lastptr->f && | |||
| 464 | (medh->agenda+medh->heap[child])->wordpos > lastptr->wordpos)) { | |||
| 465 | ||||
| 466 | medh->heap[i] = medh->heap[child]; | |||
| 467 | } else { | |||
| 468 | break; | |||
| 469 | } | |||
| 470 | } | |||
| 471 | medh->heap[i] = (lastptr-medh->agenda); | |||
| 472 | return(firstptr); | |||
| 473 | } | |||
| 474 | ||||
| 475 | int node_insert(struct apply_med_handle *medh, int wordpos, int fsmstate, int g, int h, int in, int out, int parent) { | |||
| 476 | int i,j,f; | |||
| 477 | /* We add the node in the array */ | |||
| 478 | i = medh->astarcount; | |||
| 479 | if (i >= (medh->agenda_size-1)) { | |||
| 480 | if (medh->agenda_size*2 >= medh->med_max_heap_size) { | |||
| 481 | //printf("heap limit reached by %i\n", medh->med_max_heap_size); | |||
| 482 | return 0; | |||
| 483 | } | |||
| 484 | medh->agenda_size *= 2; | |||
| 485 | medh->agenda = realloc(medh->agenda, sizeof(struct astarnode)*medh->agenda_size); | |||
| 486 | } | |||
| 487 | f = g + h; | |||
| 488 | (medh->agenda+i)->wordpos = wordpos; | |||
| 489 | (medh->agenda+i)->fsmstate = fsmstate; | |||
| 490 | (medh->agenda+i)->f = f; | |||
| 491 | (medh->agenda+i)->g = g; | |||
| 492 | (medh->agenda+i)->h = h; | |||
| 493 | (medh->agenda+i)->in = in; | |||
| 494 | (medh->agenda+i)->out = out; | |||
| 495 | (medh->agenda+i)->parent = parent; | |||
| 496 | medh->astarcount++; | |||
| 497 | ||||
| 498 | /* We also put the ptr on the heap */ | |||
| 499 | medh->heapcount++; | |||
| 500 | ||||
| 501 | if (medh->heapcount == medh->heap_size-1) { | |||
| 502 | medh->heap = realloc(medh->heap, sizeof(int)*medh->heap_size*2); | |||
| 503 | medh->heap_size *= 2; | |||
| 504 | } | |||
| 505 | /* >= makes fifo */ | |||
| 506 | // for (j = heapcount; (agenda+heap[j/2])->f > f; j /= 2) { | |||
| 507 | for (j = medh->heapcount; ( (medh->agenda+medh->heap[j>>1])->f > f) || ((medh->agenda+medh->heap[j>>1])->f >= f && (medh->agenda+medh->heap[j>>1])->wordpos <= wordpos) ; j = j >> 1) { | |||
| 508 | medh->heap[j] = medh->heap[j>>1]; | |||
| 509 | } | |||
| 510 | medh->heap[j] = i; | |||
| 511 | return 1; | |||
| 512 | } | |||
| 513 | ||||
| 514 | ||||
| 515 | void letterbits_union(int v, int vp, uint8_t *ptr, int bytes_per_letter_array) { | |||
| 516 | int i; | |||
| 517 | uint8_t *vptr, *vpptr; | |||
| 518 | vptr = ptr+(v*bytes_per_letter_array); | |||
| 519 | vpptr = ptr+(vp*bytes_per_letter_array); | |||
| 520 | for (i=0; i < bytes_per_letter_array; i++) { | |||
| 521 | *(vptr+i) = *(vptr+i)|*(vpptr+i); | |||
| 522 | } | |||
| 523 | } | |||
| 524 | ||||
| 525 | void letterbits_copy(int source, int target, uint8_t *ptr, int bytes_per_letter_array) { | |||
| 526 | int i; | |||
| 527 | uint8_t *sourceptr, *targetptr; | |||
| 528 | sourceptr = ptr+(source*bytes_per_letter_array); | |||
| 529 | targetptr = ptr+(target*bytes_per_letter_array); | |||
| 530 | for (i=0; i < bytes_per_letter_array; i++) { | |||
| 531 | *(targetptr+i) = *(sourceptr+i); | |||
| 532 | } | |||
| 533 | } | |||
| 534 | ||||
| 535 | void letterbits_add(int v, int symbol, uint8_t *ptr, int bytes_per_letter_array) { | |||
| 536 | uint8_t *vptr; | |||
| 537 | vptr = ptr+(v*bytes_per_letter_array); | |||
| 538 | BITSET(vptr, symbol)((vptr)[((symbol) >> 3)] |= (1 << ((symbol) & 7))); | |||
| 539 | } | |||
| 540 | ||||
| 541 | /* Creates, for each state, a list of symbols that can be matched */ | |||
| 542 | /* somewhere in subsequent paths (the case n = inf) */ | |||
| 543 | /* This is needed for h() of A*-search in approximate matching */ | |||
| 544 | /* This is done by a DFS where each state gets the properties */ | |||
| 545 | /* of the arcs it has as well as a copy of the properties of the target state */ | |||
| 546 | /* At the same time we keep track of the strongly connected components */ | |||
| 547 | /* And copy the properties from each root of the SCC to the children */ | |||
| 548 | /* The SCC part is required for the algorithm to work with cyclic graphs */ | |||
| 549 | /* This is basically a modification of Tarjan's (1972) SCC algorithm */ | |||
| 550 | /* However, it's converted from recursive form to iterative form using */ | |||
| 551 | /* goto statements. Here's the original recursive specification: */ | |||
| 552 | ||||
| 553 | /* Input: FSM = (V,E) */ | |||
| 554 | /* Output: bitvector v.letters for each state */ | |||
| 555 | /* index = 1 DFS node number counter */ | |||
| 556 | ||||
| 557 | /* procedure Mark(v) */ | |||
| 558 | /* v.index = index Set the depth index for v */ | |||
| 559 | /* v.lowlink = index */ | |||
| 560 | /* index++ */ | |||
| 561 | /* push(v) Push v on the stack */ | |||
| 562 | /* forall edge = (v, v') in E do Consider successors of v */ | |||
| 563 | /* v.letters |= v'.letters | edge Copy target v'.letters */ | |||
| 564 | /* if (v'.index == 0) Was successor v' visited? */ | |||
| 565 | /* Mark(v') Recurse */ | |||
| 566 | /* v.lowlink = min(v.lowlink, v'.lowlink) */ | |||
| 567 | /* elseif (v' is on stack) Was successor v' in stack S? */ | |||
| 568 | /* v.lowlink = min(v.lowlink, v'.index) */ | |||
| 569 | /* if (v.lowlink == v.index) Is v the root of a SCC? */ | |||
| 570 | /* do */ | |||
| 571 | /* pop(v') */ | |||
| 572 | /* v'.letters = v.letters */ | |||
| 573 | /* while (v' != v) */ | |||
| 574 | ||||
| 575 | /* For keeping track of the strongly connected components */ | |||
| 576 | /* when doing the DFS */ | |||
| 577 | ||||
| 578 | struct sccinfo { | |||
| 579 | int index; | |||
| 580 | int lowlink; | |||
| 581 | int on_t_stack; | |||
| 582 | }; | |||
| 583 | ||||
| 584 | void fsm_create_letter_lookup(struct apply_med_handle *medh, struct fsm *net) { | |||
| 585 | ||||
| 586 | int num_states, num_symbols, index, v, vp, copystate, i, j; | |||
| 587 | struct fsm_state *curr_ptr; | |||
| 588 | struct sccinfo *sccinfo; | |||
| 589 | int depth; | |||
| 590 | medh->maxdepth = 2; | |||
| 591 | ||||
| 592 | num_states = net->statecount; | |||
| 593 | num_symbols = sigma_max(net->sigma); | |||
| 594 | ||||
| 595 | medh->bytes_per_letter_array = BITNSLOTS(num_symbols+1)((num_symbols+1 + 8 - 1) / 8); | |||
| 596 | medh->letterbits = calloc(medh->bytes_per_letter_array*num_states,sizeof(uint8_t)); | |||
| 597 | medh->nletterbits = calloc(medh->bytes_per_letter_array*num_states,sizeof(uint8_t)); | |||
| 598 | ||||
| 599 | sccinfo = calloc(num_states,sizeof(struct sccinfo)); | |||
| 600 | ||||
| 601 | index = 1; | |||
| 602 | curr_ptr = net->states; | |||
| 603 | goto l1; | |||
| 604 | ||||
| 605 | /* Here we go again, converting a recursive algorithm to an iterative one */ | |||
| 606 | /* by gotos */ | |||
| 607 | ||||
| 608 | while(!ptr_stack_isempty()) { | |||
| 609 | ||||
| 610 | curr_ptr = ptr_stack_pop(); | |||
| 611 | ||||
| 612 | v = curr_ptr->state_no; /* source state number */ | |||
| 613 | vp = curr_ptr->target; /* target state number */ | |||
| 614 | ||||
| 615 | /* T: v.letterlist = list_union(v'->list, current edge label) */ | |||
| 616 | ||||
| 617 | letterbits_union(v, vp, medh->letterbits,medh->bytes_per_letter_array); /* add v' target bits to v */ | |||
| 618 | letterbits_add(v, curr_ptr->in, medh->letterbits,medh->bytes_per_letter_array); /* add current arc label to v */ | |||
| 619 | ||||
| 620 | (sccinfo+v)->lowlink = min_((sccinfo+v)->lowlink,(sccinfo+vp)->lowlink)(((sccinfo+v)->lowlink) < ((sccinfo+vp)->lowlink) ? ( (sccinfo+v)->lowlink) : ((sccinfo+vp)->lowlink)); | |||
| 621 | ||||
| 622 | if ((curr_ptr+1)->state_no != curr_ptr->state_no) { | |||
| 623 | goto l4; | |||
| 624 | } else { | |||
| 625 | goto l3; | |||
| 626 | } | |||
| 627 | ||||
| 628 | l1: | |||
| 629 | v = curr_ptr->state_no; | |||
| 630 | vp = curr_ptr->target; /* target */ | |||
| 631 | /* T: v.lowlink = index, index++, Tpush(v) */ | |||
| 632 | (sccinfo+v)->index = index; | |||
| 633 | (sccinfo+v)->lowlink = index; | |||
| 634 | index++; | |||
| 635 | int_stack_push(v); | |||
| 636 | (sccinfo+v)->on_t_stack = 1; | |||
| 637 | /* if v' not visited (is v'.index set) */ | |||
| 638 | ||||
| 639 | /* If done here, check lowlink, pop */ | |||
| 640 | ||||
| 641 | if (vp == -1) | |||
| 642 | goto l4; | |||
| 643 | l2: | |||
| 644 | letterbits_add(v, curr_ptr->in, medh->letterbits,medh->bytes_per_letter_array); | |||
| 645 | if ((sccinfo+vp)->index == 0) { | |||
| 646 | /* push (v,e) ptr on stack */ | |||
| 647 | ptr_stack_push(curr_ptr); | |||
| 648 | curr_ptr = (medh->state_array+(curr_ptr->target))->transitions; | |||
| 649 | /* (v,e) = (v',firstedge), goto init */ | |||
| 650 | goto l1; | |||
| 651 | /* if v' visited */ | |||
| 652 | /* T: v.lowlink = min(v.lowlink, v'.lowlink), union v.list with e, move to next edge in v, goto loop */ | |||
| 653 | } else if ((sccinfo+vp)->on_t_stack) { | |||
| 654 | (sccinfo+v)->lowlink = min_((sccinfo+v)->lowlink,(sccinfo+vp)->lowlink)(((sccinfo+v)->lowlink) < ((sccinfo+vp)->lowlink) ? ( (sccinfo+v)->lowlink) : ((sccinfo+vp)->lowlink)); | |||
| 655 | } | |||
| 656 | /* If node is visited, copy its bits */ | |||
| 657 | letterbits_union(v,vp,medh->letterbits,medh->bytes_per_letter_array); | |||
| 658 | ||||
| 659 | l3: | |||
| 660 | if ((curr_ptr+1)->state_no == curr_ptr->state_no) { | |||
| 661 | curr_ptr++; | |||
| 662 | v = curr_ptr->state_no; | |||
| 663 | vp = curr_ptr->target; /* target */ | |||
| 664 | goto l2; | |||
| 665 | } | |||
| 666 | ||||
| 667 | ||||
| 668 | /* T: if lastedge, v.lowlink == v.index, pop Tstack until v is popped and copy v.list to others */ | |||
| 669 | /* Copy all bits from root of SCC to descendants */ | |||
| 670 | l4: | |||
| 671 | if ((sccinfo+v)->lowlink == (sccinfo+v)->index) { | |||
| 672 | //printf("\nSCC: [%i] ",v); | |||
| 673 | while((copystate = int_stack_pop()) != v) { | |||
| 674 | (sccinfo+copystate)->on_t_stack = 0; | |||
| 675 | letterbits_copy(v, copystate, medh->letterbits, medh->bytes_per_letter_array); | |||
| 676 | //printf("%i ", copystate); | |||
| 677 | } | |||
| 678 | (sccinfo+v)->on_t_stack = 0; | |||
| 679 | //printf("\n"); | |||
| 680 | } | |||
| 681 | } | |||
| 682 | ||||
| 683 | for (i=0; i < num_states; i++) { | |||
| 684 | //printf("State %i: ",i); | |||
| 685 | for (j=0; j <= num_symbols; j++) { | |||
| 686 | if (BITTEST(medh->letterbits+(i*medh->bytes_per_letter_array),j)((medh->letterbits+(i*medh->bytes_per_letter_array))[(( j) >> 3)] & (1 << ((j) & 7)))) { | |||
| 687 | //printf("[%i]",j); | |||
| 688 | } | |||
| 689 | } | |||
| 690 | //printf("\n"); | |||
| 691 | } | |||
| 692 | int_stack_clear(); | |||
| 693 | ||||
| 694 | /* We do the same thing for some finite n (up to maxdepth) */ | |||
| 695 | /* and store the result in nletterbits */ | |||
| 696 | ||||
| 697 | for (v=0; v < num_states; v++) { | |||
| 698 | ptr_stack_push((medh->state_array+v)->transitions); | |||
| 699 | int_stack_push(0); | |||
| 700 | while (!ptr_stack_isempty()) { | |||
| 701 | curr_ptr = ptr_stack_pop(); | |||
| 702 | depth = int_stack_pop(); | |||
| 703 | looper: | |||
| 704 | if (depth == medh->maxdepth) | |||
| 705 | continue; | |||
| 706 | if (curr_ptr->in != -1) { | |||
| 707 | letterbits_add(v, curr_ptr->in, medh->nletterbits, medh->bytes_per_letter_array); | |||
| 708 | } | |||
| 709 | if (curr_ptr->target != -1) { | |||
| 710 | if (curr_ptr->state_no == (curr_ptr+1)->state_no) { | |||
| 711 | ptr_stack_push(curr_ptr+1); | |||
| 712 | int_stack_push(depth); | |||
| 713 | } | |||
| 714 | depth++; | |||
| 715 | curr_ptr = (medh->state_array+(curr_ptr->target))->transitions; | |||
| 716 | goto looper; | |||
| 717 | } | |||
| 718 | } | |||
| 719 | } | |||
| 720 | for (i=0; i < num_states; i++) { | |||
| 721 | //printf("State %i: ",i); | |||
| 722 | for (j=0; j <= num_symbols; j++) { | |||
| 723 | if (BITTEST(medh->nletterbits+(i*medh->bytes_per_letter_array),j)((medh->nletterbits+(i*medh->bytes_per_letter_array))[( (j) >> 3)] & (1 << ((j) & 7)))) { | |||
| 724 | //printf("[%i]",j); | |||
| 725 | } | |||
| 726 | } | |||
| 727 | //printf("\n"); | |||
| 728 | } | |||
| 729 | free(sccinfo); | |||
| 730 | } | |||
| 731 | ||||
| 732 | void cmatrix_print_att(struct fsm *net, FILE *outfile) { | |||
| 733 | int i, j, *cm, maxsigma; | |||
| 734 | maxsigma = sigma_max(net->sigma) + 1; | |||
| 735 | cm = net->medlookup->confusion_matrix; | |||
| 736 | ||||
| 737 | ||||
| 738 | for (i = 0; i < maxsigma ; i++) { | |||
| 739 | for (j = 0; j < maxsigma ; j++) { | |||
| 740 | if ((i != 0 && i < 3) || (j != 0 && j < 3)) { continue; } | |||
| 741 | if (i == 0 && j != 0) { | |||
| 742 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", "@0@", sigma_string(j, net->sigma), *(cm+i*maxsigma+j)); | |||
| 743 | } else if (j == 0 && i != 0) { | |||
| 744 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", sigma_string(i,net->sigma), "@0@", *(cm+i*maxsigma+j)); | |||
| 745 | } else if (j != 0 && i != 0) { | |||
| 746 | fprintf(outfile,"0\t0\t%s\t%s\t%i\n", sigma_string(i,net->sigma),sigma_string(j, net->sigma), *(cm+i*maxsigma+j)); | |||
| 747 | } | |||
| 748 | } | |||
| 749 | } | |||
| 750 | fprintf(outfile,"0\n"); | |||
| 751 | } | |||
| 752 | ||||
| 753 | void cmatrix_print(struct fsm *net) { | |||
| 754 | int lsymbol, i, j, *cm, maxsigma; | |||
| 755 | char *thisstring; | |||
| 756 | struct sigma *sigma; | |||
| 757 | maxsigma = sigma_max(net->sigma) + 1; | |||
| 758 | cm = net->medlookup->confusion_matrix; | |||
| 759 | ||||
| 760 | lsymbol = 0 ; | |||
| 761 | for (sigma = net->sigma; sigma != NULL((void*)0); sigma = sigma->next) { | |||
| 762 | if (sigma->number < 3) | |||
| 763 | continue; | |||
| 764 | lsymbol = strlen(sigma->symbol) > lsymbol ? strlen(sigma->symbol) : lsymbol; | |||
| 765 | } | |||
| 766 | printf("%*s",lsymbol+2,""); | |||
| 767 | printf("%s","0 "); | |||
| 768 | ||||
| 769 | for (i = 3; ; i++) { | |||
| 770 | if ((thisstring = sigma_string(i, net->sigma)) != NULL((void*)0)) { | |||
| 771 | printf("%s ",thisstring); | |||
| 772 | } else { | |||
| 773 | break; | |||
| 774 | } | |||
| 775 | } | |||
| 776 | ||||
| 777 | printf("\n"); | |||
| 778 | ||||
| 779 | for (i = 0; i < maxsigma ; i++) { | |||
| 780 | for (j = 0; j < maxsigma ; j++) { | |||
| 781 | if (j == 0) { | |||
| 782 | if (i == 0) { | |||
| 783 | printf("%*s",lsymbol+1,"0"); | |||
| 784 | printf("%*s",2,"*"); | |||
| 785 | } else { | |||
| 786 | printf("%*s",lsymbol+1, sigma_string(i, net->sigma)); | |||
| 787 | printf("%*d",2,*(cm+i*maxsigma+j)); | |||
| 788 | } | |||
| 789 | j++; | |||
| 790 | j++; | |||
| 791 | continue; | |||
| 792 | } | |||
| 793 | if (i == j) { | |||
| 794 | printf("%.*s",(int)strlen(sigma_string(j, net->sigma))+1,"*"); | |||
| 795 | } else { | |||
| 796 | printf("%.*d",(int)strlen(sigma_string(j, net->sigma))+1,*(cm+i*maxsigma+j)); | |||
| 797 | } | |||
| 798 | } | |||
| 799 | printf("\n"); | |||
| 800 | if (i == 0) { | |||
| 801 | i++; i++; | |||
| 802 | } | |||
| 803 | } | |||
| 804 | } | |||
| 805 | ||||
| 806 | void cmatrix_init(struct fsm *net) { | |||
| 807 | int maxsigma, *cm, i, j; | |||
| 808 | if (net->medlookup == NULL((void*)0)) { | |||
| 809 | net->medlookup = calloc(1,sizeof(struct medlookup)); | |||
| 810 | } | |||
| 811 | maxsigma = sigma_max(net->sigma)+1; | |||
| 812 | cm = calloc(maxsigma*maxsigma, sizeof(int)); | |||
| 813 | net->medlookup->confusion_matrix = cm; | |||
| 814 | for (i = 0; i < maxsigma; i++) { | |||
| 815 | for (j = 0; j < maxsigma; j++) { | |||
| 816 | if (i == j) | |||
| 817 | *(cm+i*maxsigma+j) = 0; | |||
| 818 | else | |||
| 819 | *(cm+i*maxsigma+j) = 1; | |||
| 820 | } | |||
| 821 | } | |||
| 822 | } | |||
| 823 | ||||
| 824 | void cmatrix_default_substitute(struct fsm *net, int cost) { | |||
| 825 | int i, j, maxsigma, *cm; | |||
| 826 | cm = net->medlookup->confusion_matrix; | |||
| 827 | maxsigma = sigma_max(net->sigma)+1; | |||
| 828 | for (i = 1; i < maxsigma; i++) { | |||
| 829 | for (j = 1; j < maxsigma; j++) { | |||
| 830 | if (i == j) { | |||
| 831 | *(cm+i*maxsigma+j) = 0; | |||
| 832 | } else { | |||
| 833 | *(cm+i*maxsigma+j) = cost; | |||
| 834 | } | |||
| 835 | } | |||
| 836 | } | |||
| 837 | } | |||
| 838 | ||||
| 839 | void cmatrix_default_insert(struct fsm *net, int cost) { | |||
| 840 | int i, maxsigma, *cm; | |||
| 841 | cm = net->medlookup->confusion_matrix; | |||
| 842 | maxsigma = sigma_max(net->sigma)+1; | |||
| 843 | for (i = 0; i < maxsigma; i++) { | |||
| 844 | *(cm+i) = cost; | |||
| 845 | } | |||
| 846 | } | |||
| 847 | ||||
| 848 | void cmatrix_default_delete(struct fsm *net, int cost) { | |||
| 849 | int i, maxsigma, *cm; | |||
| 850 | cm = net->medlookup->confusion_matrix; | |||
| 851 | maxsigma = sigma_max(net->sigma)+1; | |||
| 852 | for (i = 0; i < maxsigma; i++) { | |||
| 853 | *(cm+i*maxsigma) = cost; | |||
| 854 | } | |||
| 855 | } | |||
| 856 | ||||
| 857 | void cmatrix_set_cost(struct fsm *net, char *in, char *out, int cost) { | |||
| 858 | int i, o, maxsigma, *cm; | |||
| 859 | cm = net->medlookup->confusion_matrix; | |||
| 860 | maxsigma = sigma_max(net->sigma) + 1; | |||
| 861 | if (in == NULL((void*)0)) { | |||
| 862 | i = 0; | |||
| 863 | } else { | |||
| 864 | i = sigma_find(in, net->sigma); | |||
| 865 | } | |||
| 866 | if (out == NULL((void*)0)) { | |||
| 867 | o = 0; | |||
| 868 | } else { | |||
| 869 | o = sigma_find(out, net->sigma); | |||
| 870 | } | |||
| 871 | if (i == -1) { | |||
| 872 | printf("Warning, symbol '%s' not in alphabet\n",in); | |||
| 873 | return; | |||
| 874 | } | |||
| 875 | if (o == -1) { | |||
| 876 | printf("Warning, symbol '%s' not in alphabet\n",out); | |||
| 877 | return; | |||
| 878 | } | |||
| 879 | *(cm+i*maxsigma+o) = cost; | |||
| 880 | } |