| File: | io.c |
| Warning: | line 607, column 9 Although the value stored to 'i' is used in the enclosing expression, the value is never actually read from 'i' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
| 1 | /* Foma: a finite-state toolkit and library. */ |
| 2 | /* Copyright © 2008-2021 Mans Hulden */ |
| 3 | |
| 4 | /* This file is part of foma. */ |
| 5 | |
| 6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ |
| 7 | /* you may not use this file except in compliance with the License. */ |
| 8 | /* You may obtain a copy of the License at */ |
| 9 | |
| 10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ |
| 11 | |
| 12 | /* Unless required by applicable law or agreed to in writing, software */ |
| 13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ |
| 14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ |
| 15 | /* See the License for the specific language governing permissions and */ |
| 16 | /* limitations under the License. */ |
| 17 | |
| 18 | #include <stdio.h> |
| 19 | #include <string.h> |
| 20 | #include <stdlib.h> |
| 21 | #include <stdarg.h> |
| 22 | #include "foma.h" |
| 23 | #include "zlib.h" |
| 24 | |
/* Record kinds. NOTE(review): presumably the values stored in      */
/* binaryline.type; none are referenced in the visible part of the  */
/* file -- confirm before relying on this.                          */
#define TYPE_TRANSITION 1
#define TYPE_SYMBOL     2
#define TYPE_FINAL      3
#define TYPE_PROPERTY   4
#define TYPE_END        5
#define TYPE_ERROR      6

/* Size in bytes of the line buffer io_net_read() hands to io_gets(). */
#define READ_BUF_SIZE 4096
| 33 | |
/* One decoded input record.                                          */
/* NOTE(review): not used in the visible part of this file; the field */
/* meanings below are inferred from the names only -- confirm.        */
struct binaryline {
    int   type;     /* presumably one of the TYPE_* constants */
    int   state;    /* presumably the source state number */
    int   in;       /* presumably the input symbol number */
    int   target;   /* presumably the target state number */
    int   out;      /* presumably the output symbol number */
    int   symbol;   /* presumably a sigma symbol number */
    char *name;     /* presumably a property or symbol name */
    char *value;    /* presumably a property value */
};
| 44 | |
| 45 | extern char *g_att_epsilon; |
| 46 | |
| 47 | struct io_buf_handle { |
| 48 | char *io_buf; |
| 49 | char *io_buf_ptr; |
| 50 | }; |
| 51 | |
| 52 | struct io_buf_handle *io_init(); |
| 53 | void io_free(struct io_buf_handle *iobh); |
| 54 | static int io_gets(struct io_buf_handle *iobh, char *target); |
| 55 | static size_t io_get_gz_file_size(char *filename); |
| 56 | static size_t io_get_file_size(char *filename); |
| 57 | static size_t io_get_regular_file_size(char *filename); |
| 58 | size_t io_gz_file_to_mem (struct io_buf_handle *iobh, char *filename); |
| 59 | int foma_net_print(struct fsm *net, gzFile outfile); |
| 60 | struct fsm *io_net_read(struct io_buf_handle *iobh, char **net_name); |
| 61 | static INLINEinline int explode_line (char *buf, int *values); |
| 62 | |
| 63 | |
/* Write `string` to `stream`, emitting each double quote as the     */
/* two-character sequence \" and every other character unchanged.    */
void escape_print(FILE *stream, char* string) {
    const char *p;

    for (p = string; *p != '\0'; p++) {
        if (*p == '"') {
            fputs("\\\"", stream);
        } else {
            fputc(*p, stream);
        }
    }
}
| 78 | |
| 79 | int foma_write_prolog (struct fsm *net, char *filename) { |
| 80 | struct fsm_state *stateptr; |
| 81 | int i, *finals, *used_symbols, maxsigma; |
| 82 | FILE *out; |
| 83 | char *outstring, *instring, identifier[100]; |
| 84 | |
| 85 | if (filename == NULL((void*)0)) { |
| 86 | out = stdoutstdout; |
| 87 | } else { |
| 88 | if ((out = fopen(filename, "w")) == NULL((void*)0)) { |
| 89 | printf("Error writing to file '%s'. Using stdout.\n", filename); |
| 90 | out = stdoutstdout; |
| 91 | } |
| 92 | printf("Writing prolog to file '%s'.\n", filename); |
| 93 | } |
| 94 | fsm_count(net); |
| 95 | maxsigma = sigma_max(net->sigma); |
| 96 | used_symbols = calloc(maxsigma+1,sizeof(int)); |
| 97 | finals = malloc(sizeof(int)*(net->statecount)); |
| 98 | stateptr = net->states; |
| 99 | identifier[0] = '\0'; |
| 100 | |
| 101 | strcpy(identifier, net->name); |
| 102 | |
| 103 | /* Print identifier */ |
| 104 | fprintf(out, "%s%s%s", "network(",identifier,").\n"); |
| 105 | |
| 106 | for (i=0; (stateptr+i)->state_no != -1; i++) { |
| 107 | if ((stateptr+i)->final_state == 1) { |
| 108 | *(finals+((stateptr+i)->state_no)) = 1; |
| 109 | } else { |
| 110 | *(finals+((stateptr+i)->state_no)) = 0; |
| 111 | } |
| 112 | if ((stateptr+i)->in != -1) { |
| 113 | *(used_symbols+((stateptr+i)->in)) = 1; |
| 114 | } |
| 115 | if ((stateptr+i)->out != -1) { |
| 116 | *(used_symbols+((stateptr+i)->out)) = 1; |
| 117 | } |
| 118 | |
| 119 | } |
| 120 | |
| 121 | for (i = 3; i <= maxsigma; i++) { |
| 122 | if (*(used_symbols+i) == 0) { |
| 123 | instring = sigma_string(i, net->sigma); |
| 124 | if (strcmp(instring,"0") == 0) { |
| 125 | instring = "%0"; |
| 126 | } |
| 127 | fprintf(out, "symbol(%s, \"", identifier); |
| 128 | escape_print(out, instring); |
| 129 | fprintf(out, "\").\n"); |
| 130 | |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | for (; stateptr->state_no != -1; stateptr++) { |
| 135 | if (stateptr->target == -1) |
| 136 | continue; |
| 137 | fprintf(out, "arc(%s, %i, %i, ", identifier, stateptr->state_no, stateptr->target); |
| 138 | if (stateptr->in == 0) instring = "0"; |
| 139 | else if (stateptr->in == 1) instring = "?"; |
| 140 | else if (stateptr->in == 2) instring = "?"; |
| 141 | else instring = sigma_string(stateptr->in, net->sigma); |
| 142 | if (stateptr->out == 0) outstring = "0"; |
| 143 | else if (stateptr->out == 1) outstring = "?"; |
| 144 | else if (stateptr->out == 2) outstring = "?"; |
| 145 | else outstring = sigma_string(stateptr->out, net->sigma); |
| 146 | |
| 147 | if (strcmp(instring,"0") == 0 && stateptr->in != 0) instring = "%0"; |
| 148 | if (strcmp(outstring,"0") == 0 && stateptr->out != 0) outstring = "%0"; |
| 149 | if (strcmp(instring,"?") == 0 && stateptr->in > 2) instring = "%?"; |
| 150 | if (strcmp(outstring,"?") == 0 && stateptr->in > 2) outstring = "%?"; |
| 151 | /* Escape quotes */ |
| 152 | |
| 153 | if (net->arity == 2 && stateptr->in == IDENTITY2 && stateptr->out == IDENTITY2) { |
| 154 | fprintf(out, "\"?\").\n"); |
| 155 | } |
| 156 | else if (net->arity == 2 && stateptr->in == stateptr->out && stateptr->in != UNKNOWN1) { |
| 157 | fprintf(out, "\""); |
| 158 | escape_print(out, instring); |
| 159 | fprintf(out, "\").\n"); |
| 160 | } |
| 161 | else if (net->arity == 2) { |
| 162 | fprintf(out, "\""); |
| 163 | escape_print(out, instring); |
| 164 | fprintf(out, "\":\""); |
| 165 | escape_print(out, outstring); |
| 166 | fprintf(out, "\").\n"); |
| 167 | } |
| 168 | else if (net->arity == 1) { |
| 169 | fprintf(out, "\""); |
| 170 | escape_print(out, instring); |
| 171 | fprintf(out, "\").\n"); |
| 172 | } |
| 173 | } |
| 174 | |
| 175 | for (i = 0; i < net->statecount; i++) { |
| 176 | if (*(finals+i)) { |
| 177 | fprintf(out, "final(%s, %i).\n", identifier, i); |
| 178 | } |
| 179 | } |
| 180 | if (filename != NULL((void*)0)) { |
| 181 | fclose(out); |
| 182 | } |
| 183 | free(finals); |
| 184 | free(used_symbols); |
| 185 | return 1; |
| 186 | } |
| 187 | |
| 188 | struct fsm *read_att(char *filename) { |
| 189 | |
| 190 | struct fsm_construct_handle *h; |
| 191 | struct fsm *net; |
| 192 | int i; |
| 193 | char inword[1024], delimiters[] = "\t", *tokens[6]; |
| 194 | FILE *INFILE; |
| 195 | |
| 196 | INFILE = fopen(filename, "r"); |
| 197 | if (INFILE == NULL((void*)0)) { |
| 198 | return(NULL((void*)0)); |
| 199 | } |
| 200 | |
| 201 | h = fsm_construct_init(filename); |
| 202 | while (fgets(inword, 1024, INFILE) != NULL((void*)0)) { |
| 203 | if (inword[strlen(inword)-1] == '\n') { |
| 204 | inword[strlen(inword)-1] = '\0'; |
| 205 | } |
| 206 | tokens[0] = strtok(inword, delimiters); |
| 207 | i = 0; |
| 208 | if (tokens[0] != NULL((void*)0)) { |
| 209 | i = 1; |
| 210 | for ( ; ; ) { |
| 211 | tokens[i] = strtok(NULL((void*)0), delimiters); |
| 212 | if (tokens[i] == NULL((void*)0)) { |
| 213 | break; |
| 214 | } |
| 215 | i++; |
| 216 | if (i == 6) |
| 217 | break; |
| 218 | } |
| 219 | } |
| 220 | if (i == 0) { continue; } |
| 221 | if (i >= 4) { |
| 222 | if (strcmp(tokens[2],g_att_epsilon) == 0) |
| 223 | tokens[2] = "@_EPSILON_SYMBOL_@"; |
| 224 | if (strcmp(tokens[3],g_att_epsilon) == 0) |
| 225 | tokens[3] = "@_EPSILON_SYMBOL_@"; |
| 226 | |
| 227 | fsm_construct_add_arc(h, atoi(tokens[0]), atoi(tokens[1]), tokens[2], tokens[3]); |
| 228 | } |
| 229 | else if (i <= 3 && i > 0) { |
| 230 | fsm_construct_set_final(h,atoi(tokens[0])); |
| 231 | } |
| 232 | } |
| 233 | fsm_construct_set_initial(h,0); |
| 234 | fclose(INFILE); |
| 235 | net = fsm_construct_done(h); |
| 236 | fsm_count(net); |
| 237 | net = fsm_topsort(net); |
| 238 | return(net); |
| 239 | } |
| 240 | |
/* Copy the characters in [start, end) into dst as a NUL-terminated string. */
/* Returns 0 on success; -1 if a pointer is NULL, the range is reversed, or */
/* the text does not fit in dstsize bytes.                                  */
static int prolog_copy_span(char *dst, size_t dstsize, const char *start, const char *end) {
    size_t len;

    if (start == NULL || end == NULL || end < start) {
        return -1;
    }
    len = (size_t)(end - start);
    if (len >= dstsize) {
        return -1;
    }
    memcpy(dst, start, len);
    dst[len] = '\0';
    return 0;
}

/* Parse one `arc(NAME, SRC, TGT, "IN").` or `arc(NAME, SRC, TGT, "IN":"OUT").` */
/* clause. Stores the state numbers and raw symbol text; `out` is set to ""     */
/* for a one-symbol arc. `in` and `out` each hold symsize bytes.                */
/* Returns the arity (1 or 2), or 0 if the line is malformed.                   */
static int prolog_parse_arc(const char *line, int *source, int *target,
                            char *in, char *out, size_t symsize) {
    char number[32];
    const char *start, *end;
    int arity;

    if (strstr(line, "\":\"") == NULL || strstr(line, ", \":\").") != NULL) {
        arity = 1;
    } else {
        arity = 2;
    }

    /* Source state: text between the first space and the next comma. */
    start = strchr(line, ' ');
    if (start == NULL) {
        return 0;
    }
    start++;
    end = strchr(start, ',');
    if (prolog_copy_span(number, sizeof number, start, end) != 0) {
        return 0;
    }
    *source = atoi(number);

    /* Target state: same pattern, continuing after the source. */
    start = strchr(end, ' ');
    if (start == NULL) {
        return 0;
    }
    start++;
    end = strchr(start, ',');
    if (prolog_copy_span(number, sizeof number, start, end) != 0) {
        return 0;
    }
    *target = atoi(number);

    /* Input symbol: from the opening quote to `":` or `").`. */
    start = strchr(end, '"');
    if (start == NULL) {
        return 0;
    }
    start++;
    end = strstr(start, (arity == 2) ? "\":" : "\").");
    if (prolog_copy_span(in, symsize, start, end) != 0) {
        return 0;
    }

    out[0] = '\0';
    if (arity == 2) {
        start = strstr(end, ":\"");
        if (start == NULL) {
            return 0;
        }
        start += 2;
        end = strstr(start, "\").");
        if (prolog_copy_span(out, symsize, start, end) != 0) {
            return 0;
        }
    }
    return arity;
}

/* Read the first network from a Prolog-format file (the format written by */
/* foma_write_prolog). Malformed clauses, and clauses that appear before   */
/* the network(...) line, are skipped. If a second network(...) line is    */
/* met, a warning is printed and reading stops.                            */
/*                                                                         */
/* Returns the network, or NULL if the file cannot be opened or contains   */
/* no network(...) clause.                                                 */
/* NOTE(review): the writer escapes '"' as \" but this reader does not     */
/* unescape it; symbols containing quotes do not round-trip.               */
struct fsm *fsm_read_prolog (char *filename) {
    char buf[1024], temp[1024], in[1024], out[1024], *temp_ptr, *temp_ptr2;
    int arity, source, target, has_net;
    struct fsm *outnet;
    struct fsm_construct_handle *outh = NULL;
    FILE *prolog_file;

    has_net = 0;
    prolog_file = fopen(filename, "r");
    if (prolog_file == NULL) {
        return NULL;
    }

    while (fgets(buf, sizeof buf, prolog_file) != NULL) {
        if (strncmp(buf, "network(", 8) == 0) {
            if (has_net == 1) {
                /* Not an errno-related failure, so perror() would be misleading. */
                fprintf(stderr, "WARNING: prolog file contains multiple nets. Only returning the first one.\n");
                break;
            }
            /* Extract network name */
            temp_ptr = buf + 8;
            temp_ptr2 = strstr(temp_ptr, ").");
            if (prolog_copy_span(temp, sizeof temp, temp_ptr, temp_ptr2) != 0) {
                fprintf(stderr, "Malformed network clause in prolog file; skipping.\n");
                continue;
            }
            has_net = 1;
            /* Start network */
            outh = fsm_construct_init(temp);
            continue;
        }

        /* Nothing can be added until a network has been started. */
        if (outh == NULL) {
            continue;
        }

        if (strncmp(buf, "final(", 6) == 0) {
            temp_ptr = strchr(buf, ' ');
            if (temp_ptr == NULL) {
                fprintf(stderr, "Malformed final clause in prolog file; skipping.\n");
                continue;
            }
            temp_ptr++;
            temp_ptr2 = strstr(temp_ptr, ").");
            if (prolog_copy_span(temp, sizeof temp, temp_ptr, temp_ptr2) != 0) {
                fprintf(stderr, "Malformed final clause in prolog file; skipping.\n");
                continue;
            }
            fsm_construct_set_final(outh, atoi(temp));
            continue;
        }

        if (strncmp(buf, "symbol(", 7) == 0) {
            temp_ptr = strstr(buf, ", \"");
            if (temp_ptr == NULL) {
                fprintf(stderr, "Malformed symbol clause in prolog file; skipping.\n");
                continue;
            }
            temp_ptr += 3;
            temp_ptr2 = strstr(temp_ptr, "\").");
            if (prolog_copy_span(temp, sizeof temp, temp_ptr, temp_ptr2) != 0) {
                fprintf(stderr, "Malformed symbol clause in prolog file; skipping.\n");
                continue;
            }
            if (strcmp(temp, "%0") == 0)
                strcpy(temp, "0");

            if (fsm_construct_check_symbol(outh, temp) == -1) {
                fsm_construct_add_symbol(outh, temp);
            }
            continue;
        }

        if (strncmp(buf, "arc(", 4) == 0) {
            arity = prolog_parse_arc(buf, &source, &target, in, out, sizeof in);
            if (arity == 0) {
                fprintf(stderr, "Malformed arc clause in prolog file; skipping.\n");
                continue;
            }

            /* Map the textual special symbols; the order matters, since */
            /* "%0" and "%?" must be unescaped only after the bare "0"   */
            /* and "?" have been handled.                                */
            if (arity == 1 && (strcmp(in, "?") == 0)) {
                strcpy(in,"@_IDENTITY_SYMBOL_@");
            }
            if (arity == 2 && (strcmp(in, "?") == 0)) {
                strcpy(in,"@_UNKNOWN_SYMBOL_@");
            }
            if (arity == 2 && (strcmp(out, "?") == 0)) {
                strcpy(out,"@_UNKNOWN_SYMBOL_@");
            }
            if (strcmp(in, "0") == 0) {
                strcpy(in,"@_EPSILON_SYMBOL_@");
            }
            if (strcmp(out, "0") == 0) {
                strcpy(out,"@_EPSILON_SYMBOL_@");
            }
            if (strcmp(in, "%0") == 0) {
                strcpy(in,"0");
            }
            if (strcmp(out, "%0") == 0) {
                strcpy(out,"0");
            }
            if (strcmp(in, "%?") == 0) {
                strcpy(in,"?");
            }
            if (strcmp(out, "%?") == 0) {
                strcpy(out,"?");
            }

            if (arity == 1) {
                fsm_construct_add_arc(outh, source, target, in, in);
            } else {
                fsm_construct_add_arc(outh, source, target, in, out);
            }
        }
    }
    fclose(prolog_file);
    if (has_net != 1) {
        return(NULL);
    }
    fsm_construct_set_initial(outh, 0);
    outnet = fsm_construct_done(outh);
    /* Use the returned pointer, as read_att() does. */
    outnet = fsm_topsort(outnet);
    return(outnet);
}
| 382 | |
| 383 | struct io_buf_handle *io_init() { |
| 384 | struct io_buf_handle *iobh; |
| 385 | iobh = malloc(sizeof(struct io_buf_handle)); |
| 386 | (iobh->io_buf) = NULL((void*)0); |
| 387 | (iobh->io_buf_ptr) = NULL((void*)0); |
| 388 | return(iobh); |
| 389 | } |
| 390 | |
| 391 | void io_free(struct io_buf_handle *iobh) { |
| 392 | if (iobh->io_buf != NULL((void*)0)) { |
| 393 | free(iobh->io_buf); |
| 394 | (iobh->io_buf) = NULL((void*)0); |
| 395 | } |
| 396 | free(iobh); |
| 397 | } |
| 398 | |
/* Cut the next line off the front of *text.                              */
/* The newline that ends the line is overwritten with '\0' and *text is   */
/* moved to the character after it (or left on the final '\0').           */
/* Returns the start of the line, or NULL when *text is already at the    */
/* end of the string.                                                     */
char *spacedtext_get_next_line(char **text) {
    char *line = *text;
    char *eol;

    if (*line == '\0') {
        return NULL;
    }
    eol = line + strcspn(line, "\n");
    *text = (*eol == '\0') ? eol : eol + 1;
    *eol = '\0';
    return line;
}
| 413 | |
/* Cut the next space-separated token off the front of *text.             */
/* Leading spaces are skipped; the space or newline that ends the token   */
/* is overwritten with '\0'. *text is moved past a terminating space, or  */
/* left on the terminator when the token ended at a newline or at the end */
/* of the string.                                                         */
/* Returns the token, or NULL when only spaces remain before the end of   */
/* the line, so trailing spaces never produce an empty token.             */
char *spacedtext_get_next_token(char **text) {
    char *token = *text;
    char *end;

    while (*token == ' ') {
        token++;
    }
    /* Test for end of line only after skipping spaces. */
    if (*token == '\0' || *token == '\n') {
        *text = token;
        return NULL;
    }
    end = token + strcspn(token, " \n");
    *text = (*end == ' ') ? end + 1 : end;
    *end = '\0';
    return token;
}
| 430 | |
/* Build a network from a "spaced text" file.                             */
/* Entries are separated by blank lines. An entry with one line is a word */
/* whose space-separated symbols map to themselves; an entry with two     */
/* lines pairs the symbols of the first line (input) with those of the    */
/* second (output), padding the shorter side with epsilon. "0" stands for */
/* epsilon and "%0" for a literal zero.                                   */
/* Returns the network, or NULL if the file cannot be read.               */
struct fsm *fsm_read_spaced_text_file(char *filename) {
    struct fsm_trie_handle *trie;
    char *contents, *cursor;
    char *upper, *lower, *upper_pos, *lower_pos;
    char *in_tok, *out_tok;

    contents = file_to_mem(filename);
    if (contents == NULL) {
        return NULL;
    }
    cursor = contents;
    trie = fsm_trie_init();

    while (1) {
        /* Skip blank lines between entries. */
        while (*cursor == '\n') {
            cursor++;
        }
        upper = spacedtext_get_next_line(&cursor);
        if (upper == NULL) {
            break;
        }
        if (upper[0] == '\0') {
            continue;
        }
        lower = spacedtext_get_next_line(&cursor);

        if (lower != NULL && lower[0] != '\0') {
            /* Two-line entry: pair the symbols position by position. */
            upper_pos = upper;
            lower_pos = lower;
            for (;;) {
                in_tok = spacedtext_get_next_token(&upper_pos);
                out_tok = spacedtext_get_next_token(&lower_pos);
                if (in_tok == NULL && out_tok == NULL) {
                    break;
                }
                if (in_tok == NULL || strcmp(in_tok, "0") == 0) {
                    in_tok = "@_EPSILON_SYMBOL_@";
                } else if (strcmp(in_tok, "%0") == 0) {
                    in_tok = "0";
                }
                if (out_tok == NULL || strcmp(out_tok, "0") == 0) {
                    out_tok = "@_EPSILON_SYMBOL_@";
                } else if (strcmp(out_tok, "%0") == 0) {
                    out_tok = "0";
                }
                fsm_trie_symbol(trie, in_tok, out_tok);
            }
        } else {
            /* One-line entry: every symbol maps to itself. */
            upper_pos = upper;
            while ((in_tok = spacedtext_get_next_token(&upper_pos)) != NULL) {
                if (strcmp(in_tok, "0") == 0) {
                    fsm_trie_symbol(trie, "@_EPSILON_SYMBOL_@", "@_EPSILON_SYMBOL_@");
                } else if (strcmp(in_tok, "%0") == 0) {
                    fsm_trie_symbol(trie, "0", "0");
                } else {
                    fsm_trie_symbol(trie, in_tok, in_tok);
                }
            }
        }
        fsm_trie_end_word(trie);
    }
    free(contents);
    return(fsm_trie_done(trie));
}
| 480 | |
/* Build a network from a plain word list: each non-empty line of the */
/* file is added to a trie as one word.                               */
/* Returns the network, or NULL if the file cannot be read.           */
struct fsm *fsm_read_text_file(char *filename) {
    struct fsm_trie_handle *trie;
    char *contents, *line, *eol;
    int at_end;

    contents = file_to_mem(filename);
    if (contents == NULL) {
        return NULL;
    }
    trie = fsm_trie_init();

    line = contents;
    for (;;) {
        eol = line + strcspn(line, "\n");
        at_end = (*eol == '\0');
        *eol = '\0';               /* terminate the current line in place */
        if (line[0] != '\0') {
            fsm_trie_add_word(trie, line);
        }
        if (at_end) {
            break;
        }
        line = eol + 1;
    }
    free(contents);
    return(fsm_trie_done(trie));
}
| 508 | |
| 509 | int fsm_write_binary_file(struct fsm *net, char *filename) { |
| 510 | gzFile outfile; |
| 511 | if ((outfile = gzopen(filename,"wb")) == NULL((void*)0)) { |
| 512 | return(1); |
| 513 | } |
| 514 | foma_net_print(net, outfile); |
| 515 | gzclose(outfile); |
| 516 | return(0); |
| 517 | } |
| 518 | |
| 519 | struct fsm *fsm_read_binary_file_multiple(fsm_read_binary_handle fsrh) { |
| 520 | char *net_name; |
| 521 | struct fsm *net; |
| 522 | struct io_buf_handle *iobh; |
| 523 | iobh = (struct io_buf_handle *) fsrh; |
| 524 | net = io_net_read(iobh, &net_name); |
| 525 | if (net == NULL((void*)0)) { |
| 526 | io_free(iobh); |
| 527 | return(NULL((void*)0)); |
| 528 | } else { |
| 529 | free(net_name); |
| 530 | return(net); |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | fsm_read_binary_handle fsm_read_binary_file_multiple_init(char *filename) { |
| 535 | |
| 536 | struct io_buf_handle *iobh; |
| 537 | fsm_read_binary_handle fsm_read_handle; |
| 538 | |
| 539 | iobh = io_init(); |
| 540 | if (io_gz_file_to_mem(iobh, filename) == 0) { |
| 541 | io_free(iobh); |
| 542 | return NULL((void*)0); |
| 543 | } |
| 544 | fsm_read_handle = (void *) iobh; |
| 545 | return(fsm_read_handle); |
| 546 | } |
| 547 | |
/* Read the first network stored in `filename`.                        */
/* Returns the network, or NULL if the file cannot be loaded or does   */
/* not hold a valid network.                                           */
struct fsm *fsm_read_binary_file(char *filename) {
    char *net_name = NULL;
    struct fsm *net;
    struct io_buf_handle *iobh;

    iobh = io_init();
    if (io_gz_file_to_mem(iobh, filename) == 0) {
        io_free(iobh);
        return NULL;
    }
    net = io_net_read(iobh, &net_name);
    /* io_net_read hands back a heap copy of the name, which this API */
    /* does not expose; release it. Still NULL if it was never set.   */
    free(net_name);
    io_free(iobh);
    return(net);
}
| 561 | |
| 562 | int save_defined(struct defined_networks *def, char *filename) { |
| 563 | struct defined_networks *d; |
| 564 | gzFile outfile; |
| 565 | if (def == NULL((void*)0)) { |
| 566 | fprintf(stderrstderr, "No defined networks.\n"); |
| 567 | return(0); |
| 568 | } |
| 569 | if ((outfile = gzopen(filename, "wb")) == NULL((void*)0)) { |
| 570 | printf("Error opening file %s for writing.\n", filename); |
| 571 | return(-1); |
| 572 | } |
| 573 | printf("Writing definitions to file %s.\n", filename); |
| 574 | for (d = def; d != NULL((void*)0); d = d->next) { |
| 575 | if (!d->net) { |
| 576 | printf("Skipping definition without network.\n"); |
| 577 | continue; |
| 578 | } |
| 579 | strncpy(d->net->name, d->name, FSM_NAME_LEN40); |
| 580 | foma_net_print(d->net, outfile); |
| 581 | } |
| 582 | gzclose(outfile); |
| 583 | return(1); |
| 584 | } |
| 585 | |
/* Load every network stored in `filename` and register each one in `def` */
/* under the name recorded in the file.                                   */
/* Returns 1 on success, 0 if the file could not be loaded.               */
/* NOTE(review): the name string from io_net_read is passed to            */
/* add_defined and not freed here; whether add_defined takes ownership is */
/* not visible from this file -- confirm.                                 */
int load_defined(struct defined_networks *def, char *filename) {
    struct io_buf_handle *iobh = io_init();
    struct fsm *net;
    char *net_name;

    printf("Loading definitions from %s.\n",filename);
    if (io_gz_file_to_mem(iobh, filename) == 0) {
        fprintf(stderr, "File error.\n");
        io_free(iobh);
        return 0;
    }
    for (;;) {
        net = io_net_read(iobh, &net_name);
        if (net == NULL) {
            break;
        }
        add_defined(def, net, net_name);
    }
    io_free(iobh);
    return(1);
}
| 604 | |
/* Split `buf` in place at single spaces and convert each field with     */
/* atoi().                                                               */
/*                                                                       */
/* buf:    NUL-terminated line; separators are overwritten with '\0'.    */
/* values: receives the converted fields; must have room for 5 ints.     */
/* Returns the number of fields found. Only the first 5 are stored, so   */
/* a line with too many fields cannot overrun `values`; the caller sees  */
/* a count above 5 and can reject the line. An empty line counts as one  */
/* field with value 0.                                                   */
static inline int explode_line(char *buf, int *values) {
    enum { EXPLODE_MAX_FIELDS = 5 };
    int start, pos, items;

    pos = 0;
    items = 0;
    for (;;) {
        start = pos;
        while (buf[pos] != ' ' && buf[pos] != '\0') {
            pos++;
        }
        if (buf[pos] == '\0') {
            if (items < EXPLODE_MAX_FIELDS) {
                values[items] = atoi(buf + start);
            }
            items++;
            break;
        }
        buf[pos] = '\0';
        if (items < EXPLODE_MAX_FIELDS) {
            values[items] = atoi(buf + start);
        }
        items++;
        pos++;
    }
    return(items);
}
| 623 | |
| 624 | /* The file format we use is an extremely simple text format */ |
| 625 | /* which is gzip compressed through libz and consists of the following sections: */ |
| 626 | |
| 627 | /* ##foma-net VERSION##*/ |
| 628 | /* ##props## */ |
| 629 | /* PROPERTIES LINE */ |
| 630 | /* ##sigma## */ |
| 631 | /* ...SIGMA LINES... */ |
| 632 | /* ##states## */ |
| 633 | /* ...TRANSITION LINES... */ |
| 634 | /* ##end## */ |
| 635 | |
| 636 | /* Several networks may be concatenated in one file */ |
| 637 | |
| 638 | /* The initial identifier is "##foma-net 1.0##" */ |
| 639 | /* where 1.0 is the version number for the file format */ |
| 640 | /* followed by the line "##props##" */ |
| 641 | /* which is followed by a line of space separated integers */ |
| 642 | /* which correspond to: */ |
| 643 | |
| 644 | /* arity arccount statecount linecount finalcount pathcount is_deterministic */ |
| 645 | /* is_pruned is_minimized is_epsilon_free is_loop_free extras name */ |
| 646 | |
| 647 | /* where extras packs is_completed (bits 0-1), arcs_sorted_in (bits 2-3) and arcs_sorted_out (bits 4-5), and name is used if defined networks are saved/loaded */ |
| 648 | |
| 649 | /* Following the props line, we accept anything (for future expansion) */ |
| 650 | /* until we find ##sigma## */ |
| 651 | |
| 652 | /* the section beginning with "##sigma##" consists of lines with two fields: */ |
| 653 | /* number string */ |
| 654 | /* corresponding to the symbol number and the symbol string */ |
| 655 | |
| 656 | /* the section beginning with "##states##" consists of lines of ASCII integers */ |
| 657 | /* with 2-5 fields to avoid some redundancy in every line corresponding to a */ |
| 658 | /* transition where otherwise state numbers would be unnecessarily repeated and */ |
| 659 | /* out symbols also (if in = out as is the case for recognizers/simple automata) */ |
| 660 | |
| 661 | /* The information depending on the number of fields in the lines is as follows: */ |
| 662 | |
| 663 | /* 2: in target (here state_no is the same as the last mentioned one and out = in) */ |
| 664 | /* 3: in out target (again, state_no is the same as the last mentioned one) */ |
| 665 | /* 4: state_no in target final_state (where out = in) */ |
| 666 | /* 5: state_no in out target final_state */ |
| 667 | |
| 668 | /* There is no harm in always using 5 fields; however this will take up more space */ |
| 669 | |
| 670 | /* As in struct fsm_state, states without transitions are represented as a 4-field: */ |
| 671 | /* state_no -1 -1 final_state (since in=out for 4-field lines, out = -1 as well) */ |
| 672 | |
| 673 | /* As gzopen will read uncompressed files as well, one can gunzip a file */ |
| 674 | /* that contains a network and still read it */ |
| 675 | |
| 676 | struct fsm *io_net_read(struct io_buf_handle *iobh, char **net_name) { |
| 677 | |
| 678 | char buf[READ_BUF_SIZE4096]; |
| 679 | struct fsm *net; |
| 680 | struct fsm_state *fsm; |
| 681 | |
| 682 | char *new_symbol; |
| 683 | int i, items, new_symbol_number, laststate, lineint[5], *cm; |
| 684 | int extras; |
| 685 | char last_final = '1'; |
| 686 | |
| 687 | if (io_gets(iobh, buf) == 0) { |
| 688 | return NULL((void*)0); |
| 689 | } |
| 690 | |
| 691 | net = fsm_create(""); |
| 692 | |
| 693 | if (strcmp(buf, "##foma-net 1.0##") != 0) { |
| 694 | fsm_destroy(net); |
| 695 | perror("File format error foma!\n"); |
| 696 | return NULL((void*)0); |
| 697 | } |
| 698 | io_gets(iobh, buf); |
| 699 | if (strcmp(buf, "##props##") != 0) { |
| 700 | perror("File format error props!\n"); |
| 701 | fsm_destroy(net); |
| 702 | return NULL((void*)0); |
| 703 | } |
| 704 | /* Properties */ |
| 705 | io_gets(iobh, buf); |
| 706 | extras = 0; |
| 707 | sscanf(buf, "%i %i %i %i %i %lld %i %i %i %i %i %i %s", &net->arity, &net->arccount, &net->statecount, &net->linecount, &net->finalcount, &net->pathcount, &net->is_deterministic, &net->is_pruned, &net->is_minimized, &net->is_epsilon_free, &net->is_loop_free, &extras, buf); |
| 708 | strncpy(net->name, buf, FSM_NAME_LEN40); |
| 709 | *net_name = strdup(buf); |
| 710 | io_gets(iobh, buf); |
| 711 | |
| 712 | net->is_completed = (extras & 3); |
| 713 | net->arcs_sorted_in = (extras & 12) >> 2; |
| 714 | net->arcs_sorted_out = (extras & 48) >> 4; |
| 715 | |
| 716 | /* Sigma */ |
| 717 | while (strcmp(buf, "##sigma##") != 0) { /* Loop until we encounter ##sigma## */ |
| 718 | if (buf[0] == '\0') { |
| 719 | printf("File format error at sigma definition!\n"); |
| 720 | fsm_destroy(net); |
| 721 | return NULL((void*)0); |
| 722 | } |
| 723 | io_gets(iobh, buf); |
| 724 | } |
| 725 | |
| 726 | for (;;) { |
| 727 | io_gets(iobh, buf); |
| 728 | if (buf[0] == '#') break; |
| 729 | if (buf[0] == '\0') continue; |
| 730 | new_symbol = strstr(buf, " "); |
| 731 | new_symbol[0] = '\0'; |
| 732 | new_symbol++; |
| 733 | if (new_symbol[0] == '\0') { |
| 734 | sscanf(buf,"%i", &new_symbol_number); |
| 735 | sigma_add_number(net->sigma, "\n", new_symbol_number); |
| 736 | } else { |
| 737 | sscanf(buf,"%i", &new_symbol_number); |
| 738 | sigma_add_number(net->sigma, new_symbol, new_symbol_number); |
| 739 | } |
| 740 | } |
| 741 | |
| 742 | /* States */ |
| 743 | if (strcmp(buf, "##states##") != 0) { |
| 744 | printf("File format error!\n"); |
| 745 | return NULL((void*)0); |
| 746 | } |
| 747 | net->states = malloc(net->linecount*sizeof(struct fsm_state)); |
| 748 | fsm = net->states; |
| 749 | laststate = -1; |
| 750 | for (i=0; ;i++) { |
| 751 | io_gets(iobh, buf); |
| 752 | if (buf[0] == '#') break; |
| 753 | |
| 754 | /* scanf is just too slow here */ |
| 755 | |
| 756 | //items = sscanf(buf, "%i %i %i %i %i",&lineint[0], &lineint[1], &lineint[2], &lineint[3], &lineint[4]); |
| 757 | |
| 758 | items = explode_line(buf, &lineint[0]); |
| 759 | |
| 760 | switch (items) { |
| 761 | case 2: |
| 762 | (fsm+i)->state_no = laststate; |
| 763 | (fsm+i)->in = lineint[0]; |
| 764 | (fsm+i)->out = lineint[0]; |
| 765 | (fsm+i)->target = lineint[1]; |
| 766 | (fsm+i)->final_state = last_final; |
| 767 | break; |
| 768 | case 3: |
| 769 | (fsm+i)->state_no = laststate; |
| 770 | (fsm+i)->in = lineint[0]; |
| 771 | (fsm+i)->out = lineint[1]; |
| 772 | (fsm+i)->target = lineint[2]; |
| 773 | (fsm+i)->final_state = last_final; |
| 774 | break; |
| 775 | case 4: |
| 776 | (fsm+i)->state_no = lineint[0]; |
| 777 | (fsm+i)->in = lineint[1]; |
| 778 | (fsm+i)->out = lineint[1]; |
| 779 | (fsm+i)->target = lineint[2]; |
| 780 | (fsm+i)->final_state = lineint[3]; |
| 781 | laststate = lineint[0]; |
| 782 | last_final = lineint[3]; |
| 783 | break; |
| 784 | case 5: |
| 785 | (fsm+i)->state_no = lineint[0]; |
| 786 | (fsm+i)->in = lineint[1]; |
| 787 | (fsm+i)->out = lineint[2]; |
| 788 | (fsm+i)->target = lineint[3]; |
| 789 | (fsm+i)->final_state = lineint[4]; |
| 790 | laststate = lineint[0]; |
| 791 | last_final = lineint[4]; |
| 792 | break; |
| 793 | default: |
| 794 | printf("File format error\n"); |
| 795 | return NULL((void*)0); |
| 796 | } |
| 797 | if (laststate > 0) { |
| 798 | (fsm+i)->start_state = 0; |
| 799 | } else if (laststate == -1) { |
| 800 | (fsm+i)->start_state = -1; |
| 801 | } else { |
| 802 | (fsm+i)->start_state = 1; |
| 803 | } |
| 804 | |
| 805 | } |
| 806 | if (strcmp(buf, "##cmatrix##") == 0) { |
| 807 | cmatrix_init(net); |
| 808 | cm = net->medlookup->confusion_matrix; |
| 809 | for (;;) { |
| 810 | io_gets(iobh, buf); |
| 811 | if (buf[0] == '#') break; |
| 812 | sscanf(buf,"%i", &i); |
| 813 | *cm = i; |
| 814 | cm++; |
| 815 | } |
| 816 | } |
| 817 | if (strcmp(buf, "##end##") != 0) { |
| 818 | printf("File format error!\n"); |
| 819 | return NULL((void*)0); |
| 820 | } |
| 821 | return(net); |
| 822 | } |
| 823 | |
| 824 | static int io_gets(struct io_buf_handle *iobh, char *target) { |
| 825 | int i; |
| 826 | for (i = 0; *((iobh->io_buf_ptr)+i) != '\n' && *((iobh->io_buf_ptr)+i) != '\0'; i++) { |
| 827 | *(target+i) = *((iobh->io_buf_ptr)+i); |
| 828 | } |
| 829 | *(target+i) = '\0'; |
| 830 | if (*((iobh->io_buf_ptr)+i) == '\0') |
| 831 | (iobh->io_buf_ptr) = (iobh->io_buf_ptr) + i; |
| 832 | else |
| 833 | (iobh->io_buf_ptr) = (iobh->io_buf_ptr) + i + 1; |
| 834 | |
| 835 | return(i); |
| 836 | } |
| 837 | |
| 838 | int foma_net_print(struct fsm *net, gzFile outfile) { |
| 839 | struct sigma *sigma; |
| 840 | struct fsm_state *fsm; |
| 841 | int i, maxsigma, laststate, *cm, extras; |
| 842 | |
| 843 | /* Header */ |
| 844 | gzprintf(outfile, "%s","##foma-net 1.0##\n"); |
| 845 | |
| 846 | /* Properties */ |
| 847 | gzprintf(outfile, "%s","##props##\n"); |
| 848 | |
| 849 | extras = (net->is_completed) | (net->arcs_sorted_in << 2) | (net->arcs_sorted_out << 4); |
| 850 | |
| 851 | gzprintf(outfile, |
| 852 | "%i %i %i %i %i %lld %i %i %i %i %i %i %s\n", net->arity, net->arccount, net->statecount, net->linecount, net->finalcount, net->pathcount, net->is_deterministic, net->is_pruned, net->is_minimized, net->is_epsilon_free, net->is_loop_free, extras, net->name); |
| 853 | |
| 854 | /* Sigma */ |
| 855 | gzprintf(outfile, "%s","##sigma##\n"); |
| 856 | for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1; sigma = sigma->next) { |
| 857 | gzprintf(outfile, "%i %s\n",sigma->number, sigma->symbol); |
| 858 | } |
| 859 | |
| 860 | /* State array */ |
| 861 | laststate = -1; |
| 862 | gzprintf(outfile, "%s","##states##\n"); |
| 863 | for (fsm = net->states; fsm->state_no !=-1; fsm++) { |
| 864 | if (fsm->state_no != laststate) { |
| 865 | if (fsm->in != fsm->out) { |
| 866 | gzprintf(outfile, "%i %i %i %i %i\n",fsm->state_no, fsm->in, fsm->out, fsm->target, fsm->final_state); |
| 867 | } else { |
| 868 | gzprintf(outfile, "%i %i %i %i\n",fsm->state_no, fsm->in, fsm->target, fsm->final_state); |
| 869 | } |
| 870 | } else { |
| 871 | if (fsm->in != fsm->out) { |
| 872 | gzprintf(outfile, "%i %i %i\n", fsm->in, fsm->out, fsm->target); |
| 873 | } else { |
| 874 | gzprintf(outfile, "%i %i\n", fsm->in, fsm->target); |
| 875 | } |
| 876 | } |
| 877 | laststate = fsm->state_no; |
| 878 | } |
| 879 | /* Sentinel for states */ |
| 880 | gzprintf(outfile, "-1 -1 -1 -1 -1\n"); |
| 881 | |
| 882 | /* Store confusion matrix */ |
| 883 | if (net->medlookup != NULL((void*)0) && net->medlookup->confusion_matrix != NULL((void*)0)) { |
| 884 | |
| 885 | gzprintf(outfile, "%s","##cmatrix##\n"); |
| 886 | cm = net->medlookup->confusion_matrix; |
| 887 | maxsigma = sigma_max(net->sigma)+1; |
| 888 | for (i=0; i < maxsigma*maxsigma; i++) { |
| 889 | gzprintf(outfile, "%i\n", *(cm+i)); |
| 890 | } |
| 891 | } |
| 892 | |
| 893 | /* End */ |
| 894 | gzprintf(outfile, "%s","##end##\n"); |
| 895 | return(1); |
| 896 | } |
| 897 | |
| 898 | int net_print_att(struct fsm *net, FILE *outfile) { |
| 899 | struct fsm_state *fsm; |
| 900 | struct fsm_sigma_list *sl; |
| 901 | int i, prev; |
| 902 | |
| 903 | fsm = net->states; |
| 904 | sl = sigma_to_list(net->sigma); |
| 905 | if (sigma_max(net->sigma) >= 0) { |
| 906 | (sl+0)->symbol = g_att_epsilon; |
| 907 | } |
| 908 | for (i=0; (fsm+i)->state_no != -1; i++) { |
| 909 | if ((fsm+i)->target != -1) { |
| 910 | fprintf(outfile, "%i\t%i\t%s\t%s\n",(fsm+i)->state_no,(fsm+i)->target, (sl+(fsm+i)->in)->symbol, (sl+(fsm+i)->out)->symbol); |
| 911 | } |
| 912 | } |
| 913 | prev = -1; |
| 914 | for (i=0; (fsm+i)->state_no != -1; prev = (fsm+i)->state_no, i++) { |
| 915 | if ((fsm+i)->state_no != prev && (fsm+i)->final_state == 1) { |
| 916 | fprintf(outfile, "%i\n",(fsm+i)->state_no); |
| 917 | } |
| 918 | } |
| 919 | free(sl); |
| 920 | return(1); |
| 921 | } |
| 922 | |
/*
 * Return the uncompressed size recorded in the trailer of a gzip file.
 *
 * The last four bytes of a .gz file store the size of the uncompressed
 * data as a little-endian 32-bit value (so the figure is only the true
 * size modulo 2^32).
 *
 * filename: path of the gzip file to inspect.
 *
 * Returns the decoded size, or 0 if the file cannot be opened, is shorter
 * than four bytes, or the trailer cannot be read.
 */
static size_t io_get_gz_file_size(char *filename) {

    FILE *infile;
    unsigned char trailer[4];
    size_t nread;
    unsigned long numbytes;

    /* Binary mode: the trailer is raw bytes, not text. */
    infile = fopen(filename, "rb");
    if (infile == NULL) {
        return(0);
    }
    /* Fails when the file holds fewer than four bytes. */
    if (fseek(infile, -4L, SEEK_END) != 0) {
        fclose(infile);
        return(0);
    }
    nread = fread(trailer, 1, sizeof trailer, infile);
    fclose(infile);
    if (nread != sizeof trailer) {
        return(0);
    }

    /* Assemble the little-endian value byte by byte. */
    numbytes = (unsigned long)trailer[0]
             | ((unsigned long)trailer[1] << 8)
             | ((unsigned long)trailer[2] << 16)
             | ((unsigned long)trailer[3] << 24);
    return((size_t)numbytes);
}
| 941 | |
/*
 * Return the size in bytes of an ordinary (uncompressed) file, measured
 * by seeking to its end.
 *
 * filename: path of the file to measure.
 *
 * Returns the end-of-file offset, or 0 if the file cannot be opened or
 * its end position cannot be determined.
 */
static size_t io_get_regular_file_size(char *filename) {

    FILE *infile;
    long endpos;

    infile = fopen(filename, "rb");
    if (infile == NULL) {
        return(0);
    }
    endpos = -1L;
    if (fseek(infile, 0L, SEEK_END) == 0) {
        endpos = ftell(infile);
    }
    fclose(infile);

    /* ftell() reports failure as -1; never let that become a huge size_t. */
    if (endpos < 0) {
        return(0);
    }
    return((size_t)endpos);
}
| 953 | |
| 954 | |
| 955 | static size_t io_get_file_size(char *filename) { |
| 956 | gzFile FILE; |
| 957 | size_t size; |
| 958 | FILE = gzopen(filename, "r"); |
| 959 | if (FILE == NULL((void*)0)) { |
| 960 | return(0); |
| 961 | } |
| 962 | if (gzdirect(FILE) == 1) { |
| 963 | gzclose(FILE); |
| 964 | size = io_get_regular_file_size(filename); |
| 965 | } else { |
| 966 | gzclose(FILE); |
| 967 | size = io_get_gz_file_size(filename); |
| 968 | } |
| 969 | return(size); |
| 970 | } |
| 971 | |
| 972 | size_t io_gz_file_to_mem(struct io_buf_handle *iobh, char *filename) { |
| 973 | |
| 974 | size_t size; |
| 975 | gzFile FILE; |
| 976 | |
| 977 | size = io_get_file_size(filename); |
| 978 | if (size == 0) { |
| 979 | return 0; |
| 980 | } |
| 981 | (iobh->io_buf) = malloc((size+1)*sizeof(char)); |
| 982 | FILE = gzopen(filename, "rb"); |
| 983 | gzread(FILE, iobh->io_buf, size); |
| 984 | gzclose(FILE); |
| 985 | *((iobh->io_buf)+size) = '\0'; |
| 986 | iobh->io_buf_ptr = iobh->io_buf; |
| 987 | return(size); |
| 988 | } |
| 989 | |
/* One known byte-order mark: its leading bytes, how many of them are
   significant, and a printable name for the encoding. */
struct BOM {
    char code[4];
    int len;
    char* name;
};
typedef struct BOM BOM;

/* Table of recognised byte-order marks. The 4-byte UTF-32LE mark is
   listed before the 2-byte UTF-16LE mark that is its prefix. The list
   is terminated by an entry whose len is 0. */
static BOM BOM_codes[] = {
    { { 0xEF, 0xBB, 0xBF },       3, "UTF-8"    },
    { { 0xFF, 0xFE, 0x00, 0x00 }, 4, "UTF-32LE" },
    { { 0x00, 0x00, 0xFE, 0xFF }, 4, "UTF-32BE" },
    { { 0xFF, 0xFE },             2, "UTF16-LE" },
    { { 0xFE, 0xFF },             2, "UTF16-BE" },
    { { 0, },                     0, NULL       },
};

/*
 * Look for a byte-order mark at the start of `buffer`.
 *
 * Returns a pointer to the first matching entry of BOM_codes, or NULL
 * when no entry matches.
 *
 * NOTE(review): the comparison uses strncmp(), which stops at the first
 * NUL byte. Entries containing 0x00 are therefore only compared up to
 * that byte, so e.g. a buffer that starts with a NUL byte matches the
 * UTF-32BE entry. A length-aware comparison would need the buffer size,
 * which this interface does not carry.
 */
BOM *check_BOM(char *buffer) {
    int idx;

    for (idx = 0; BOM_codes[idx].len != 0; idx++) {
        if (strncmp(BOM_codes[idx].code, buffer, BOM_codes[idx].len) != 0) {
            continue;
        }
        return &BOM_codes[idx];
    }
    return NULL;
}
| 1014 | |
| 1015 | char *file_to_mem(char *name) { |
| 1016 | FILE *infile; |
| 1017 | size_t numbytes; |
| 1018 | char *buffer; |
| 1019 | BOM *bom; |
| 1020 | infile = fopen(name, "r"); |
| 1021 | if(infile == NULL((void*)0)) { |
| 1022 | printf("Error opening file '%s'\n",name); |
| 1023 | return NULL((void*)0); |
| 1024 | } |
| 1025 | fseek(infile, 0L, SEEK_END2); |
| 1026 | numbytes = ftell(infile); |
| 1027 | fseek(infile, 0L, SEEK_SET0); |
| 1028 | buffer = (char*)malloc((numbytes+1) * sizeof(char)); |
| 1029 | if(buffer == NULL((void*)0)) { |
| 1030 | printf("Error reading file '%s'\n",name); |
| 1031 | return NULL((void*)0); |
| 1032 | } |
| 1033 | if (fread(buffer, sizeof(char), numbytes, infile) != numbytes) { |
| 1034 | printf("Error reading file '%s'\n",name); |
| 1035 | return NULL((void*)0); |
| 1036 | } |
| 1037 | |
| 1038 | bom = check_BOM(buffer); |
| 1039 | if (bom != NULL((void*)0)) { |
| 1040 | printf("%s BOM mark is detected in file '%s'.\n",bom->name,name); |
| 1041 | return NULL((void*)0); |
| 1042 | } |
| 1043 | fclose(infile); |
| 1044 | *(buffer+numbytes)='\0'; |
| 1045 | return(buffer); |
| 1046 | } |