File: | io.c |
Warning: | line 607, column 9 Although the value stored to 'i' is used in the enclosing expression, the value is never actually read from 'i' |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | /* Foma: a finite-state toolkit and library. */ |
2 | /* Copyright © 2008-2021 Mans Hulden */ |
3 | |
4 | /* This file is part of foma. */ |
5 | |
6 | /* Licensed under the Apache License, Version 2.0 (the "License"); */ |
7 | /* you may not use this file except in compliance with the License. */ |
8 | /* You may obtain a copy of the License at */ |
9 | |
10 | /* http://www.apache.org/licenses/LICENSE-2.0 */ |
11 | |
12 | /* Unless required by applicable law or agreed to in writing, software */ |
13 | /* distributed under the License is distributed on an "AS IS" BASIS, */ |
14 | /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ |
15 | /* See the License for the specific language governing permissions and */ |
16 | /* limitations under the License. */ |
17 | |
18 | #include <stdio.h> |
19 | #include <string.h> |
20 | #include <stdlib.h> |
21 | #include <stdarg.h> |
22 | #include "foma.h" |
23 | #include "zlib.h" |
24 | |
/* Line/record type tags. They are not referenced in the visible part of */
/* this file; presumably they tag struct binaryline.type -- TODO confirm. */
#define TYPE_TRANSITION 1
#define TYPE_SYMBOL 2
#define TYPE_FINAL 3
#define TYPE_PROPERTY 4
#define TYPE_END 5
#define TYPE_ERROR 6

/* Size in bytes of the line buffer used when parsing a serialized network. */
#define READ_BUF_SIZE 4096
33 | |
/* One parsed record of a serialized network. */
/* NOTE(review): not used in the visible part of this file; the field */
/* meanings below are inferred from their names -- confirm before relying. */
struct binaryline {
    int type;    /* record kind (presumably one of the TYPE_* tags) */
    int state;   /* source state number */
    int in;      /* input symbol number */
    int target;  /* target state number */
    int out;     /* output symbol number */
    int symbol;  /* symbol number for symbol records */
    char *name;  /* property or symbol name */
    char *value; /* property value */
};
44 | |
/* String that stands for epsilon in AT&T-format files (defined elsewhere). */
extern char *g_att_epsilon;

/* In-memory copy of a (decompressed) file plus a read cursor into it. */
struct io_buf_handle {
    char *io_buf;     /* start of the NUL-terminated buffer; released by io_free() */
    char *io_buf_ptr; /* next unread position inside io_buf, advanced by io_gets() */
};
51 | |
52 | struct io_buf_handle *io_init(); |
53 | void io_free(struct io_buf_handle *iobh); |
54 | static int io_gets(struct io_buf_handle *iobh, char *target); |
55 | static size_t io_get_gz_file_size(char *filename); |
56 | static size_t io_get_file_size(char *filename); |
57 | static size_t io_get_regular_file_size(char *filename); |
58 | size_t io_gz_file_to_mem (struct io_buf_handle *iobh, char *filename); |
59 | int foma_net_print(struct fsm *net, gzFile outfile); |
60 | struct fsm *io_net_read(struct io_buf_handle *iobh, char **net_name); |
61 | static INLINEinline int explode_line (char *buf, int *values); |
62 | |
63 | |
/* Write `string` to `stream`, emitting every double quote as \" .        */
/* No other character is escaped (backslashes are written unchanged).     */
void escape_print(FILE *stream, char* string) {
    const char *p;

    for (p = string; *p != '\0'; p++) {
        if (*p == '"') {
            fputs("\\\"", stream);
        } else {
            fputc(*p, stream);
        }
    }
}
78 | |
79 | int foma_write_prolog (struct fsm *net, char *filename) { |
80 | struct fsm_state *stateptr; |
81 | int i, *finals, *used_symbols, maxsigma; |
82 | FILE *out; |
83 | char *outstring, *instring, identifier[100]; |
84 | |
85 | if (filename == NULL((void*)0)) { |
86 | out = stdoutstdout; |
87 | } else { |
88 | if ((out = fopen(filename, "w")) == NULL((void*)0)) { |
89 | printf("Error writing to file '%s'. Using stdout.\n", filename); |
90 | out = stdoutstdout; |
91 | } |
92 | printf("Writing prolog to file '%s'.\n", filename); |
93 | } |
94 | fsm_count(net); |
95 | maxsigma = sigma_max(net->sigma); |
96 | used_symbols = calloc(maxsigma+1,sizeof(int)); |
97 | finals = malloc(sizeof(int)*(net->statecount)); |
98 | stateptr = net->states; |
99 | identifier[0] = '\0'; |
100 | |
101 | strcpy(identifier, net->name); |
102 | |
103 | /* Print identifier */ |
104 | fprintf(out, "%s%s%s", "network(",identifier,").\n"); |
105 | |
106 | for (i=0; (stateptr+i)->state_no != -1; i++) { |
107 | if ((stateptr+i)->final_state == 1) { |
108 | *(finals+((stateptr+i)->state_no)) = 1; |
109 | } else { |
110 | *(finals+((stateptr+i)->state_no)) = 0; |
111 | } |
112 | if ((stateptr+i)->in != -1) { |
113 | *(used_symbols+((stateptr+i)->in)) = 1; |
114 | } |
115 | if ((stateptr+i)->out != -1) { |
116 | *(used_symbols+((stateptr+i)->out)) = 1; |
117 | } |
118 | |
119 | } |
120 | |
121 | for (i = 3; i <= maxsigma; i++) { |
122 | if (*(used_symbols+i) == 0) { |
123 | instring = sigma_string(i, net->sigma); |
124 | if (strcmp(instring,"0") == 0) { |
125 | instring = "%0"; |
126 | } |
127 | fprintf(out, "symbol(%s, \"", identifier); |
128 | escape_print(out, instring); |
129 | fprintf(out, "\").\n"); |
130 | |
131 | } |
132 | } |
133 | |
134 | for (; stateptr->state_no != -1; stateptr++) { |
135 | if (stateptr->target == -1) |
136 | continue; |
137 | fprintf(out, "arc(%s, %i, %i, ", identifier, stateptr->state_no, stateptr->target); |
138 | if (stateptr->in == 0) instring = "0"; |
139 | else if (stateptr->in == 1) instring = "?"; |
140 | else if (stateptr->in == 2) instring = "?"; |
141 | else instring = sigma_string(stateptr->in, net->sigma); |
142 | if (stateptr->out == 0) outstring = "0"; |
143 | else if (stateptr->out == 1) outstring = "?"; |
144 | else if (stateptr->out == 2) outstring = "?"; |
145 | else outstring = sigma_string(stateptr->out, net->sigma); |
146 | |
147 | if (strcmp(instring,"0") == 0 && stateptr->in != 0) instring = "%0"; |
148 | if (strcmp(outstring,"0") == 0 && stateptr->out != 0) outstring = "%0"; |
149 | if (strcmp(instring,"?") == 0 && stateptr->in > 2) instring = "%?"; |
150 | if (strcmp(outstring,"?") == 0 && stateptr->in > 2) outstring = "%?"; |
151 | /* Escape quotes */ |
152 | |
153 | if (net->arity == 2 && stateptr->in == IDENTITY2 && stateptr->out == IDENTITY2) { |
154 | fprintf(out, "\"?\").\n"); |
155 | } |
156 | else if (net->arity == 2 && stateptr->in == stateptr->out && stateptr->in != UNKNOWN1) { |
157 | fprintf(out, "\""); |
158 | escape_print(out, instring); |
159 | fprintf(out, "\").\n"); |
160 | } |
161 | else if (net->arity == 2) { |
162 | fprintf(out, "\""); |
163 | escape_print(out, instring); |
164 | fprintf(out, "\":\""); |
165 | escape_print(out, outstring); |
166 | fprintf(out, "\").\n"); |
167 | } |
168 | else if (net->arity == 1) { |
169 | fprintf(out, "\""); |
170 | escape_print(out, instring); |
171 | fprintf(out, "\").\n"); |
172 | } |
173 | } |
174 | |
175 | for (i = 0; i < net->statecount; i++) { |
176 | if (*(finals+i)) { |
177 | fprintf(out, "final(%s, %i).\n", identifier, i); |
178 | } |
179 | } |
180 | if (filename != NULL((void*)0)) { |
181 | fclose(out); |
182 | } |
183 | free(finals); |
184 | free(used_symbols); |
185 | return 1; |
186 | } |
187 | |
188 | struct fsm *read_att(char *filename) { |
189 | |
190 | struct fsm_construct_handle *h; |
191 | struct fsm *net; |
192 | int i; |
193 | char inword[1024], delimiters[] = "\t", *tokens[6]; |
194 | FILE *INFILE; |
195 | |
196 | INFILE = fopen(filename, "r"); |
197 | if (INFILE == NULL((void*)0)) { |
198 | return(NULL((void*)0)); |
199 | } |
200 | |
201 | h = fsm_construct_init(filename); |
202 | while (fgets(inword, 1024, INFILE) != NULL((void*)0)) { |
203 | if (inword[strlen(inword)-1] == '\n') { |
204 | inword[strlen(inword)-1] = '\0'; |
205 | } |
206 | tokens[0] = strtok(inword, delimiters); |
207 | i = 0; |
208 | if (tokens[0] != NULL((void*)0)) { |
209 | i = 1; |
210 | for ( ; ; ) { |
211 | tokens[i] = strtok(NULL((void*)0), delimiters); |
212 | if (tokens[i] == NULL((void*)0)) { |
213 | break; |
214 | } |
215 | i++; |
216 | if (i == 6) |
217 | break; |
218 | } |
219 | } |
220 | if (i == 0) { continue; } |
221 | if (i >= 4) { |
222 | if (strcmp(tokens[2],g_att_epsilon) == 0) |
223 | tokens[2] = "@_EPSILON_SYMBOL_@"; |
224 | if (strcmp(tokens[3],g_att_epsilon) == 0) |
225 | tokens[3] = "@_EPSILON_SYMBOL_@"; |
226 | |
227 | fsm_construct_add_arc(h, atoi(tokens[0]), atoi(tokens[1]), tokens[2], tokens[3]); |
228 | } |
229 | else if (i <= 3 && i > 0) { |
230 | fsm_construct_set_final(h,atoi(tokens[0])); |
231 | } |
232 | } |
233 | fsm_construct_set_initial(h,0); |
234 | fclose(INFILE); |
235 | net = fsm_construct_done(h); |
236 | fsm_count(net); |
237 | net = fsm_topsort(net); |
238 | return(net); |
239 | } |
240 | |
/* Copy the bytes in [begin, end) into dst and NUL-terminate them.           */
/* Returns 1 on success; 0 if either pointer is NULL, the range is reversed, */
/* or the text does not fit in dstsize bytes (dst is then left untouched).   */
static int prolog_copy_span(char *dst, size_t dstsize, const char *begin, const char *end) {
    size_t len;

    if (begin == NULL || end == NULL || end < begin) {
        return 0;
    }
    len = (size_t)(end - begin);
    if (len >= dstsize) {
        return 0;
    }
    memcpy(dst, begin, len);
    dst[len] = '\0';
    return 1;
}

/* Read the first network from a Prolog-format file: a network(NAME). clause  */
/* followed by symbol(...), arc(...) and final(...) clauses, one per line.    */
/* Malformed clauses and clauses that appear before network(...) are skipped. */
/* A second network(...) clause ends reading with a warning on stderr.        */
/* Returns the network, or NULL if the file cannot be opened or holds no      */
/* network(...) clause.                                                       */
struct fsm *fsm_read_prolog (char *filename) {
    /* in/out are as large as buf so any label found in a line always fits */
    char buf[1024], temp[1024], in[1024], out[1024], *temp_ptr, *temp_ptr2;
    int arity, source, target, has_net;
    struct fsm *outnet;
    struct fsm_construct_handle *outh = NULL;
    FILE *prolog_file;

    has_net = 0;
    prolog_file = fopen(filename, "r");
    if (prolog_file == NULL) {
        return NULL;
    }

    while (fgets(buf, sizeof buf, prolog_file) != NULL) {
        if (strncmp(buf, "network(", 8) == 0) {
            if (has_net == 1) {
                fprintf(stderr, "WARNING: prolog file contains multiple nets. Only returning the first one.\n");
                break;
            }
            /* Extract network name */
            if (!prolog_copy_span(temp, sizeof temp, buf + 8, strstr(buf + 8, ")."))) {
                continue;
            }
            has_net = 1;
            /* Start network */
            outh = fsm_construct_init(temp);
            continue;
        }
        if (outh == NULL) {
            /* Nothing can be added before a network has been started */
            continue;
        }
        if (strncmp(buf, "final(", 6) == 0) {
            temp_ptr = strchr(buf, ' ');
            if (temp_ptr == NULL) {
                continue;
            }
            temp_ptr++;
            if (!prolog_copy_span(temp, sizeof temp, temp_ptr, strstr(temp_ptr, ")."))) {
                continue;
            }
            fsm_construct_set_final(outh, atoi(temp));
            continue;
        }
        if (strncmp(buf, "symbol(", 7) == 0) {
            temp_ptr = strstr(buf, ", \"");
            if (temp_ptr == NULL) {
                continue;
            }
            temp_ptr += 3;
            if (!prolog_copy_span(temp, sizeof temp, temp_ptr, strstr(temp_ptr, "\")."))) {
                continue;
            }
            if (strcmp(temp, "%0") == 0)
                strcpy(temp, "0");

            if (fsm_construct_check_symbol(outh, temp) == -1) {
                fsm_construct_add_symbol(outh, temp);
            }
            continue;
        }
        if (strncmp(buf, "arc(", 4) == 0) {
            in[0] = '\0';
            out[0] = '\0';

            /* A label without ":" in the middle (or the lone symbol ":") is one-sided */
            if (strstr(buf, "\":\"") == NULL || strstr(buf, ", \":\").") != NULL) {
                arity = 1;
            } else {
                arity = 2;
            }

            /* Get source */
            temp_ptr = strchr(buf, ' ');
            if (temp_ptr == NULL) {
                continue;
            }
            temp_ptr++;
            temp_ptr2 = strchr(temp_ptr, ',');
            if (!prolog_copy_span(temp, sizeof temp, temp_ptr, temp_ptr2)) {
                continue;
            }
            source = atoi(temp);

            /* Get target */
            temp_ptr = strchr(temp_ptr2, ' ');
            if (temp_ptr == NULL) {
                continue;
            }
            temp_ptr++;
            temp_ptr2 = strchr(temp_ptr, ',');
            if (!prolog_copy_span(temp, sizeof temp, temp_ptr, temp_ptr2)) {
                continue;
            }
            target = atoi(temp);

            /* Get input label */
            temp_ptr = strchr(temp_ptr2, '"');
            if (temp_ptr == NULL) {
                continue;
            }
            temp_ptr++;
            if (arity == 2) {
                temp_ptr2 = strstr(temp_ptr, "\":");
            } else {
                temp_ptr2 = strstr(temp_ptr, "\").");
            }
            if (!prolog_copy_span(in, sizeof in, temp_ptr, temp_ptr2)) {
                continue;
            }

            /* Get output label */
            if (arity == 2) {
                temp_ptr = strstr(temp_ptr2, ":\"");
                if (temp_ptr == NULL) {
                    continue;
                }
                temp_ptr += 2;
                if (!prolog_copy_span(out, sizeof out, temp_ptr, strstr(temp_ptr, "\")."))) {
                    continue;
                }
            }

            /* Special labels first, then the %-escaped literals. The order   */
            /* matters: "%0" must become "0" only after the epsilon test ran. */
            if (arity == 1 && (strcmp(in, "?") == 0)) {
                strcpy(in, "@_IDENTITY_SYMBOL_@");
            }
            if (arity == 2 && (strcmp(in, "?") == 0)) {
                strcpy(in, "@_UNKNOWN_SYMBOL_@");
            }
            if (arity == 2 && (strcmp(out, "?") == 0)) {
                strcpy(out, "@_UNKNOWN_SYMBOL_@");
            }
            if (strcmp(in, "0") == 0) {
                strcpy(in, "@_EPSILON_SYMBOL_@");
            }
            if (strcmp(out, "0") == 0) {
                strcpy(out, "@_EPSILON_SYMBOL_@");
            }
            if (strcmp(in, "%0") == 0) {
                strcpy(in, "0");
            }
            if (strcmp(out, "%0") == 0) {
                strcpy(out, "0");
            }
            if (strcmp(in, "%?") == 0) {
                strcpy(in, "?");
            }
            if (strcmp(out, "%?") == 0) {
                strcpy(out, "?");
            }

            if (arity == 1) {
                fsm_construct_add_arc(outh, source, target, in, in);
            } else {
                fsm_construct_add_arc(outh, source, target, in, out);
            }
        }
    }
    fclose(prolog_file);
    if (has_net == 1) {
        fsm_construct_set_initial(outh, 0);
        outnet = fsm_construct_done(outh);
        /* Keep the returned pointer, as read_att() does */
        outnet = fsm_topsort(outnet);
        return(outnet);
    } else {
        return(NULL);
    }
}
382 | |
383 | struct io_buf_handle *io_init() { |
384 | struct io_buf_handle *iobh; |
385 | iobh = malloc(sizeof(struct io_buf_handle)); |
386 | (iobh->io_buf) = NULL((void*)0); |
387 | (iobh->io_buf_ptr) = NULL((void*)0); |
388 | return(iobh); |
389 | } |
390 | |
391 | void io_free(struct io_buf_handle *iobh) { |
392 | if (iobh->io_buf != NULL((void*)0)) { |
393 | free(iobh->io_buf); |
394 | (iobh->io_buf) = NULL((void*)0); |
395 | } |
396 | free(iobh); |
397 | } |
398 | |
/* Cut the next line off the text at *text.                                  */
/* The terminating newline (if any) is overwritten with NUL and *text is     */
/* moved to the start of the following line, or to the final NUL.            */
/* Returns the start of the line (possibly an empty string), or NULL when    */
/* *text already points at the end of the text.                              */
char *spacedtext_get_next_line(char **text) {
    char *line = *text;
    char *end;

    if (*line == '\0')
        return NULL;
    end = line;
    while (*end != '\0' && *end != '\n') {
        end++;
    }
    if (*end == '\n') {
        *end = '\0';
        *text = end + 1;
    } else {
        *text = end;
    }
    return(line);
}
413 | |
/* Cut the next space-separated token off the line at *text.                 */
/* Leading spaces are skipped first, so trailing or repeated spaces never    */
/* produce an empty token. The token is NUL-terminated in place and *text    */
/* moves past the separating space, or stays on the line end.                */
/* Returns the token, or NULL when no token is left before NUL or newline.   */
char *spacedtext_get_next_token(char **text) {
    char *tok, *end;

    while (**text == ' ') {
        (*text)++;
    }
    if (**text == '\0' || **text == '\n')
        return NULL;
    tok = *text;
    for (end = tok; *end != '\0' && *end != '\n' && *end != ' '; end++) {
    }
    if (*end == ' ')
        *text = end + 1;
    else
        *text = end;
    *end = '\0';
    return(tok);
}
430 | |
/* Build a network from a "spaced text" file.                                 */
/* Entries are separated by blank lines. An entry of one line is a word whose */
/* space-separated symbols map to themselves; an entry of two lines pairs the */
/* symbols of the first line (input) with those of the second (output), the   */
/* shorter side being padded with epsilon.                                    */
/* The token 0 means epsilon and %0 means the literal symbol "0".             */
/* Returns NULL if the file cannot be loaded.                                 */
struct fsm *fsm_read_spaced_text_file(char *filename) {
    struct fsm_trie_handle *th;
    char *text, *textorig, *insym, *outsym, *upper, *lower, *ucur, *lcur;

    text = textorig = file_to_mem(filename);
    if (text == NULL)
        return NULL;

    th = fsm_trie_init();
    for (;;) {
        /* Skip the blank lines between entries */
        while (*text == '\n') {
            text++;
        }
        upper = spacedtext_get_next_line(&text);
        if (upper == NULL)
            break;
        if (upper[0] == '\0')
            continue;
        lower = spacedtext_get_next_line(&text);
        if (lower == NULL || lower[0] == '\0') {
            /* One-line entry: every symbol maps to itself */
            ucur = upper;
            while ((insym = spacedtext_get_next_token(&ucur)) != NULL) {
                if (strcmp(insym, "0") == 0)
                    fsm_trie_symbol(th, "@_EPSILON_SYMBOL_@", "@_EPSILON_SYMBOL_@");
                else if (strcmp(insym, "%0") == 0)
                    fsm_trie_symbol(th, "0", "0");
                else
                    fsm_trie_symbol(th, insym, insym);
            }
            fsm_trie_end_word(th);
        } else {
            /* Two-line entry: walk both lines in step */
            ucur = upper;
            lcur = lower;
            for (;;) {
                insym = spacedtext_get_next_token(&ucur);
                outsym = spacedtext_get_next_token(&lcur);
                if (insym == NULL && outsym == NULL)
                    break;
                if (insym == NULL || strcmp(insym, "0") == 0)
                    insym = "@_EPSILON_SYMBOL_@";
                if (strcmp(insym, "%0") == 0)
                    insym = "0";
                if (outsym == NULL || strcmp(outsym, "0") == 0)
                    outsym = "@_EPSILON_SYMBOL_@";
                if (strcmp(outsym, "%0") == 0)
                    outsym = "0";
                fsm_trie_symbol(th, insym, outsym);
            }
            fsm_trie_end_word(th);
        }
    }
    free(textorig);
    return(fsm_trie_done(th));
}
480 | |
/* Build a network from a word list: every non-empty line of `filename` is */
/* added as one word. Returns NULL if the file cannot be loaded.           */
struct fsm *fsm_read_text_file(char *filename) {
    struct fsm_trie_handle *th;
    char *text, *line, *eol;
    int last;

    text = file_to_mem(filename);
    if (text == NULL) {
        return NULL;
    }
    th = fsm_trie_init();

    for (line = text; ; line = eol + 1) {
        for (eol = line; *eol != '\n' && *eol != '\0'; eol++) {
        }
        last = (*eol == '\0');
        if (last && eol == line)
            break;              /* nothing after the final newline */
        *eol = '\0';
        if (*line != '\0')
            fsm_trie_add_word(th, line);
        if (last)
            break;
    }
    free(text);
    return(fsm_trie_done(th));
}
508 | |
509 | int fsm_write_binary_file(struct fsm *net, char *filename) { |
510 | gzFile outfile; |
511 | if ((outfile = gzopen(filename,"wb")) == NULL((void*)0)) { |
512 | return(1); |
513 | } |
514 | foma_net_print(net, outfile); |
515 | gzclose(outfile); |
516 | return(0); |
517 | } |
518 | |
519 | struct fsm *fsm_read_binary_file_multiple(fsm_read_binary_handle fsrh) { |
520 | char *net_name; |
521 | struct fsm *net; |
522 | struct io_buf_handle *iobh; |
523 | iobh = (struct io_buf_handle *) fsrh; |
524 | net = io_net_read(iobh, &net_name); |
525 | if (net == NULL((void*)0)) { |
526 | io_free(iobh); |
527 | return(NULL((void*)0)); |
528 | } else { |
529 | free(net_name); |
530 | return(net); |
531 | } |
532 | } |
533 | |
534 | fsm_read_binary_handle fsm_read_binary_file_multiple_init(char *filename) { |
535 | |
536 | struct io_buf_handle *iobh; |
537 | fsm_read_binary_handle fsm_read_handle; |
538 | |
539 | iobh = io_init(); |
540 | if (io_gz_file_to_mem(iobh, filename) == 0) { |
541 | io_free(iobh); |
542 | return NULL((void*)0); |
543 | } |
544 | fsm_read_handle = (void *) iobh; |
545 | return(fsm_read_handle); |
546 | } |
547 | |
/* Read the first network stored in `filename` (gzip-compressed or plain). */
/* Returns NULL if the file cannot be loaded or parsed.                    */
struct fsm *fsm_read_binary_file(char *filename) {
    char *net_name = NULL;
    struct fsm *net;
    struct io_buf_handle *iobh;

    iobh = io_init();
    if (iobh == NULL) {
        return NULL;
    }
    if (io_gz_file_to_mem(iobh, filename) == 0) {
        io_free(iobh);
        return NULL;
    }
    net = io_net_read(iobh, &net_name);
    /* io_net_read() hands back a heap copy of the name; it is unused here */
    free(net_name);
    io_free(iobh);
    return(net);
}
561 | |
562 | int save_defined(struct defined_networks *def, char *filename) { |
563 | struct defined_networks *d; |
564 | gzFile outfile; |
565 | if (def == NULL((void*)0)) { |
566 | fprintf(stderrstderr, "No defined networks.\n"); |
567 | return(0); |
568 | } |
569 | if ((outfile = gzopen(filename, "wb")) == NULL((void*)0)) { |
570 | printf("Error opening file %s for writing.\n", filename); |
571 | return(-1); |
572 | } |
573 | printf("Writing definitions to file %s.\n", filename); |
574 | for (d = def; d != NULL((void*)0); d = d->next) { |
575 | if (!d->net) { |
576 | printf("Skipping definition without network.\n"); |
577 | continue; |
578 | } |
579 | strncpy(d->net->name, d->name, FSM_NAME_LEN40); |
580 | foma_net_print(d->net, outfile); |
581 | } |
582 | gzclose(outfile); |
583 | return(1); |
584 | } |
585 | |
/* Read every network stored in `filename` and register each one in `def`  */
/* under the name saved with it. Returns 1 on success, 0 if the file could */
/* not be loaded.                                                          */
int load_defined(struct defined_networks *def, char *filename) {
    struct fsm *net;
    char *net_name;
    struct io_buf_handle *iobh;

    iobh = io_init();
    printf("Loading definitions from %s.\n", filename);
    if (iobh == NULL || io_gz_file_to_mem(iobh, filename) == 0) {
        fprintf(stderr, "File error.\n");
        io_free(iobh);   /* accepts NULL only if io_free guards it; see below */
        return 0;
    }
    for (;;) {
        net = io_net_read(iobh, &net_name);
        if (net == NULL) {
            break;
        }
        /* NOTE(review): net_name is heap-allocated by io_net_read(); whether */
        /* add_defined() copies it or takes ownership is not visible here.    */
        add_defined(def, net, net_name);
    }
    io_free(iobh);
    return(1);
}
604 | |
/* Split the space-separated integers in `buf` and store them in `values`.   */
/* Separating spaces in `buf` are overwritten with NUL.                      */
/* Returns the number of fields found (an empty line counts as one field     */
/* with value 0). At most five values are stored, matching the five-element  */
/* array the caller passes; further fields are counted but not written, so   */
/* an over-long line shows up as an unexpected count instead of an overrun.  */
static INLINE int explode_line(char *buf, int *values) {
    enum { EXPLODE_MAX_VALUES = 5 };
    int start, pos, items, at_end;

    pos = items = 0;
    for (;;) {
        for (start = pos; buf[pos] != ' ' && buf[pos] != '\0'; pos++) { }
        at_end = (buf[pos] == '\0');
        if (!at_end) {
            buf[pos] = '\0';
        }
        if (items < EXPLODE_MAX_VALUES) {
            values[items] = atoi(buf + start);
        }
        items++;
        if (at_end) {
            break;
        }
        pos++;
    }
    return(items);
}
623 | |
624 | /* The file format we use is an extremely simple text format */ |
625 | /* which is gzip compressed through libz and consists of the following sections: */ |
626 | |
627 | /* ##foma-net VERSION##*/ |
628 | /* ##props## */ |
629 | /* PROPERTIES LINE */ |
630 | /* ##sigma## */ |
631 | /* ...SIGMA LINES... */ |
632 | /* ##states## */ |
633 | /* ...TRANSITION LINES... */ |
634 | /* ##end## */ |
635 | |
636 | /* Several networks may be concatenated in one file */ |
637 | |
638 | /* The initial identifier is "##foma-net 1.0##" */ |
639 | /* where 1.0 is the version number for the file format */ |
640 | /* followed by the line "##props##" */ |
/* which is followed by a line of space-separated fields */
/* which correspond to: */

/* arity arccount statecount linecount finalcount pathcount is_deterministic */
/* is_pruned is_minimized is_epsilon_free is_loop_free extras name */
/* (extras packs is_completed in bits 0-1, arcs_sorted_in in bits 2-3 */
/* and arcs_sorted_out in bits 4-5) */
646 | |
647 | /* where name is used if defined networks are saved/loaded */ |
648 | |
649 | /* Following the props line, we accept anything (for future expansion) */ |
650 | /* until we find ##sigma## */ |
651 | |
/* the section beginning with "##sigma##" consists of lines with two fields: */
/* number string */
/* corresponding to the symbol number and the symbol string */
655 | |
656 | /* the section beginning with "##states##" consists of lines of ASCII integers */ |
657 | /* with 2-5 fields to avoid some redundancy in every line corresponding to a */ |
658 | /* transition where otherwise state numbers would be unnecessarily repeated and */ |
659 | /* out symbols also (if in = out as is the case for recognizers/simple automata) */ |
660 | |
661 | /* The information depending on the number of fields in the lines is as follows: */ |
662 | |
663 | /* 2: in target (here state_no is the same as the last mentioned one and out = in) */ |
664 | /* 3: in out target (again, state_no is the same as the last mentioned one) */ |
665 | /* 4: state_no in target final_state (where out = in) */ |
666 | /* 5: state_no in out target final_state */ |
667 | |
668 | /* There is no harm in always using 5 fields; however this will take up more space */ |
669 | |
670 | /* As in struct fsm_state, states without transitions are represented as a 4-field: */ |
671 | /* state_no -1 -1 final_state (since in=out for 4-field lines, out = -1 as well) */ |
672 | |
/* As gzopen will read uncompressed files as well, one can gunzip a file */
/* that contains a network and still read it */
675 | |
676 | struct fsm *io_net_read(struct io_buf_handle *iobh, char **net_name) { |
677 | |
678 | char buf[READ_BUF_SIZE4096]; |
679 | struct fsm *net; |
680 | struct fsm_state *fsm; |
681 | |
682 | char *new_symbol; |
683 | int i, items, new_symbol_number, laststate, lineint[5], *cm; |
684 | int extras; |
685 | char last_final = '1'; |
686 | |
687 | if (io_gets(iobh, buf) == 0) { |
688 | return NULL((void*)0); |
689 | } |
690 | |
691 | net = fsm_create(""); |
692 | |
693 | if (strcmp(buf, "##foma-net 1.0##") != 0) { |
694 | fsm_destroy(net); |
695 | perror("File format error foma!\n"); |
696 | return NULL((void*)0); |
697 | } |
698 | io_gets(iobh, buf); |
699 | if (strcmp(buf, "##props##") != 0) { |
700 | perror("File format error props!\n"); |
701 | fsm_destroy(net); |
702 | return NULL((void*)0); |
703 | } |
704 | /* Properties */ |
705 | io_gets(iobh, buf); |
706 | extras = 0; |
707 | sscanf(buf, "%i %i %i %i %i %lld %i %i %i %i %i %i %s", &net->arity, &net->arccount, &net->statecount, &net->linecount, &net->finalcount, &net->pathcount, &net->is_deterministic, &net->is_pruned, &net->is_minimized, &net->is_epsilon_free, &net->is_loop_free, &extras, buf); |
708 | strncpy(net->name, buf, FSM_NAME_LEN40); |
709 | *net_name = strdup(buf); |
710 | io_gets(iobh, buf); |
711 | |
712 | net->is_completed = (extras & 3); |
713 | net->arcs_sorted_in = (extras & 12) >> 2; |
714 | net->arcs_sorted_out = (extras & 48) >> 4; |
715 | |
716 | /* Sigma */ |
717 | while (strcmp(buf, "##sigma##") != 0) { /* Loop until we encounter ##sigma## */ |
718 | if (buf[0] == '\0') { |
719 | printf("File format error at sigma definition!\n"); |
720 | fsm_destroy(net); |
721 | return NULL((void*)0); |
722 | } |
723 | io_gets(iobh, buf); |
724 | } |
725 | |
726 | for (;;) { |
727 | io_gets(iobh, buf); |
728 | if (buf[0] == '#') break; |
729 | if (buf[0] == '\0') continue; |
730 | new_symbol = strstr(buf, " "); |
731 | new_symbol[0] = '\0'; |
732 | new_symbol++; |
733 | if (new_symbol[0] == '\0') { |
734 | sscanf(buf,"%i", &new_symbol_number); |
735 | sigma_add_number(net->sigma, "\n", new_symbol_number); |
736 | } else { |
737 | sscanf(buf,"%i", &new_symbol_number); |
738 | sigma_add_number(net->sigma, new_symbol, new_symbol_number); |
739 | } |
740 | } |
741 | |
742 | /* States */ |
743 | if (strcmp(buf, "##states##") != 0) { |
744 | printf("File format error!\n"); |
745 | return NULL((void*)0); |
746 | } |
747 | net->states = malloc(net->linecount*sizeof(struct fsm_state)); |
748 | fsm = net->states; |
749 | laststate = -1; |
750 | for (i=0; ;i++) { |
751 | io_gets(iobh, buf); |
752 | if (buf[0] == '#') break; |
753 | |
754 | /* scanf is just too slow here */ |
755 | |
756 | //items = sscanf(buf, "%i %i %i %i %i",&lineint[0], &lineint[1], &lineint[2], &lineint[3], &lineint[4]); |
757 | |
758 | items = explode_line(buf, &lineint[0]); |
759 | |
760 | switch (items) { |
761 | case 2: |
762 | (fsm+i)->state_no = laststate; |
763 | (fsm+i)->in = lineint[0]; |
764 | (fsm+i)->out = lineint[0]; |
765 | (fsm+i)->target = lineint[1]; |
766 | (fsm+i)->final_state = last_final; |
767 | break; |
768 | case 3: |
769 | (fsm+i)->state_no = laststate; |
770 | (fsm+i)->in = lineint[0]; |
771 | (fsm+i)->out = lineint[1]; |
772 | (fsm+i)->target = lineint[2]; |
773 | (fsm+i)->final_state = last_final; |
774 | break; |
775 | case 4: |
776 | (fsm+i)->state_no = lineint[0]; |
777 | (fsm+i)->in = lineint[1]; |
778 | (fsm+i)->out = lineint[1]; |
779 | (fsm+i)->target = lineint[2]; |
780 | (fsm+i)->final_state = lineint[3]; |
781 | laststate = lineint[0]; |
782 | last_final = lineint[3]; |
783 | break; |
784 | case 5: |
785 | (fsm+i)->state_no = lineint[0]; |
786 | (fsm+i)->in = lineint[1]; |
787 | (fsm+i)->out = lineint[2]; |
788 | (fsm+i)->target = lineint[3]; |
789 | (fsm+i)->final_state = lineint[4]; |
790 | laststate = lineint[0]; |
791 | last_final = lineint[4]; |
792 | break; |
793 | default: |
794 | printf("File format error\n"); |
795 | return NULL((void*)0); |
796 | } |
797 | if (laststate > 0) { |
798 | (fsm+i)->start_state = 0; |
799 | } else if (laststate == -1) { |
800 | (fsm+i)->start_state = -1; |
801 | } else { |
802 | (fsm+i)->start_state = 1; |
803 | } |
804 | |
805 | } |
806 | if (strcmp(buf, "##cmatrix##") == 0) { |
807 | cmatrix_init(net); |
808 | cm = net->medlookup->confusion_matrix; |
809 | for (;;) { |
810 | io_gets(iobh, buf); |
811 | if (buf[0] == '#') break; |
812 | sscanf(buf,"%i", &i); |
813 | *cm = i; |
814 | cm++; |
815 | } |
816 | } |
817 | if (strcmp(buf, "##end##") != 0) { |
818 | printf("File format error!\n"); |
819 | return NULL((void*)0); |
820 | } |
821 | return(net); |
822 | } |
823 | |
824 | static int io_gets(struct io_buf_handle *iobh, char *target) { |
825 | int i; |
826 | for (i = 0; *((iobh->io_buf_ptr)+i) != '\n' && *((iobh->io_buf_ptr)+i) != '\0'; i++) { |
827 | *(target+i) = *((iobh->io_buf_ptr)+i); |
828 | } |
829 | *(target+i) = '\0'; |
830 | if (*((iobh->io_buf_ptr)+i) == '\0') |
831 | (iobh->io_buf_ptr) = (iobh->io_buf_ptr) + i; |
832 | else |
833 | (iobh->io_buf_ptr) = (iobh->io_buf_ptr) + i + 1; |
834 | |
835 | return(i); |
836 | } |
837 | |
838 | int foma_net_print(struct fsm *net, gzFile outfile) { |
839 | struct sigma *sigma; |
840 | struct fsm_state *fsm; |
841 | int i, maxsigma, laststate, *cm, extras; |
842 | |
843 | /* Header */ |
844 | gzprintf(outfile, "%s","##foma-net 1.0##\n"); |
845 | |
846 | /* Properties */ |
847 | gzprintf(outfile, "%s","##props##\n"); |
848 | |
849 | extras = (net->is_completed) | (net->arcs_sorted_in << 2) | (net->arcs_sorted_out << 4); |
850 | |
851 | gzprintf(outfile, |
852 | "%i %i %i %i %i %lld %i %i %i %i %i %i %s\n", net->arity, net->arccount, net->statecount, net->linecount, net->finalcount, net->pathcount, net->is_deterministic, net->is_pruned, net->is_minimized, net->is_epsilon_free, net->is_loop_free, extras, net->name); |
853 | |
854 | /* Sigma */ |
855 | gzprintf(outfile, "%s","##sigma##\n"); |
856 | for (sigma = net->sigma; sigma != NULL((void*)0) && sigma->number != -1; sigma = sigma->next) { |
857 | gzprintf(outfile, "%i %s\n",sigma->number, sigma->symbol); |
858 | } |
859 | |
860 | /* State array */ |
861 | laststate = -1; |
862 | gzprintf(outfile, "%s","##states##\n"); |
863 | for (fsm = net->states; fsm->state_no !=-1; fsm++) { |
864 | if (fsm->state_no != laststate) { |
865 | if (fsm->in != fsm->out) { |
866 | gzprintf(outfile, "%i %i %i %i %i\n",fsm->state_no, fsm->in, fsm->out, fsm->target, fsm->final_state); |
867 | } else { |
868 | gzprintf(outfile, "%i %i %i %i\n",fsm->state_no, fsm->in, fsm->target, fsm->final_state); |
869 | } |
870 | } else { |
871 | if (fsm->in != fsm->out) { |
872 | gzprintf(outfile, "%i %i %i\n", fsm->in, fsm->out, fsm->target); |
873 | } else { |
874 | gzprintf(outfile, "%i %i\n", fsm->in, fsm->target); |
875 | } |
876 | } |
877 | laststate = fsm->state_no; |
878 | } |
879 | /* Sentinel for states */ |
880 | gzprintf(outfile, "-1 -1 -1 -1 -1\n"); |
881 | |
882 | /* Store confusion matrix */ |
883 | if (net->medlookup != NULL((void*)0) && net->medlookup->confusion_matrix != NULL((void*)0)) { |
884 | |
885 | gzprintf(outfile, "%s","##cmatrix##\n"); |
886 | cm = net->medlookup->confusion_matrix; |
887 | maxsigma = sigma_max(net->sigma)+1; |
888 | for (i=0; i < maxsigma*maxsigma; i++) { |
889 | gzprintf(outfile, "%i\n", *(cm+i)); |
890 | } |
891 | } |
892 | |
893 | /* End */ |
894 | gzprintf(outfile, "%s","##end##\n"); |
895 | return(1); |
896 | } |
897 | |
898 | int net_print_att(struct fsm *net, FILE *outfile) { |
899 | struct fsm_state *fsm; |
900 | struct fsm_sigma_list *sl; |
901 | int i, prev; |
902 | |
903 | fsm = net->states; |
904 | sl = sigma_to_list(net->sigma); |
905 | if (sigma_max(net->sigma) >= 0) { |
906 | (sl+0)->symbol = g_att_epsilon; |
907 | } |
908 | for (i=0; (fsm+i)->state_no != -1; i++) { |
909 | if ((fsm+i)->target != -1) { |
910 | fprintf(outfile, "%i\t%i\t%s\t%s\n",(fsm+i)->state_no,(fsm+i)->target, (sl+(fsm+i)->in)->symbol, (sl+(fsm+i)->out)->symbol); |
911 | } |
912 | } |
913 | prev = -1; |
914 | for (i=0; (fsm+i)->state_no != -1; prev = (fsm+i)->state_no, i++) { |
915 | if ((fsm+i)->state_no != prev && (fsm+i)->final_state == 1) { |
916 | fprintf(outfile, "%i\n",(fsm+i)->state_no); |
917 | } |
918 | } |
919 | free(sl); |
920 | return(1); |
921 | } |
922 | |
/* Return the uncompressed size recorded in the last four bytes of a .gz    */
/* file (little-endian), or 0 if the file cannot be opened or is shorter    */
/* than four bytes.                                                         */
static size_t io_get_gz_file_size(char *filename) {
    FILE *infile;
    unsigned char bytes[4];
    size_t numbytes = 0;

    /* Binary mode: the trailer must be read without newline translation */
    infile = fopen(filename, "rb");
    if (infile == NULL) {
        return(0);
    }
    if (fseek(infile, -4L, SEEK_END) == 0 &&
        fread(bytes, 1, sizeof bytes, infile) == sizeof bytes) {
        numbytes = (size_t)bytes[0]
                 | ((size_t)bytes[1] << 8)
                 | ((size_t)bytes[2] << 16)
                 | ((size_t)bytes[3] << 24);
    }
    fclose(infile);
    return(numbytes);
}
941 | |
/* Return the size of `filename` in bytes, or 0 if it cannot be opened or */
/* its size cannot be determined.                                         */
static size_t io_get_regular_file_size(char *filename) {
    FILE *infile;
    long pos = -1;

    infile = fopen(filename, "rb");
    if (infile == NULL) {
        return(0);
    }
    if (fseek(infile, 0L, SEEK_END) == 0) {
        pos = ftell(infile);
    }
    fclose(infile);
    /* ftell reports failure as -1, which must not turn into a huge size_t */
    return(pos < 0 ? 0 : (size_t)pos);
}
953 | |
954 | |
955 | static size_t io_get_file_size(char *filename) { |
956 | gzFile FILE; |
957 | size_t size; |
958 | FILE = gzopen(filename, "r"); |
959 | if (FILE == NULL((void*)0)) { |
960 | return(0); |
961 | } |
962 | if (gzdirect(FILE) == 1) { |
963 | gzclose(FILE); |
964 | size = io_get_regular_file_size(filename); |
965 | } else { |
966 | gzclose(FILE); |
967 | size = io_get_gz_file_size(filename); |
968 | } |
969 | return(size); |
970 | } |
971 | |
972 | size_t io_gz_file_to_mem(struct io_buf_handle *iobh, char *filename) { |
973 | |
974 | size_t size; |
975 | gzFile FILE; |
976 | |
977 | size = io_get_file_size(filename); |
978 | if (size == 0) { |
979 | return 0; |
980 | } |
981 | (iobh->io_buf) = malloc((size+1)*sizeof(char)); |
982 | FILE = gzopen(filename, "rb"); |
983 | gzread(FILE, iobh->io_buf, size); |
984 | gzclose(FILE); |
985 | *((iobh->io_buf)+size) = '\0'; |
986 | iobh->io_buf_ptr = iobh->io_buf; |
987 | return(size); |
988 | } |
989 | |
/* A byte-order mark: its bytes, how many of them are significant, and the */
/* encoding name used in messages.                                         */
typedef struct BOM {
    unsigned char code[4];
    int len;
    char* name;
} BOM;

/* Known byte-order marks. The four-byte UTF-32 marks come before the     */
/* two-byte UTF-16 marks that are their prefixes. A zero length ends the  */
/* table.                                                                 */
static BOM BOM_codes[] = {
    { { 0xEF, 0xBB, 0xBF }, 3, "UTF-8"},
    { { 0xFF, 0xFE, 0x00, 0x00 }, 4, "UTF-32LE" },
    { { 0x00, 0x00, 0xFE, 0xFF }, 4, "UTF-32BE" },
    { { 0xFF, 0xFE }, 2, "UTF16-LE" },
    { { 0xFE, 0xFF }, 2, "UTF16-BE" },
    { { 0, } , 0, NULL },
};

/* Return the table entry whose mark starts the `len` bytes at `data`, or */
/* NULL. Bytes are compared with memcmp because marks may contain NUL.    */
static BOM *bom_lookup(const char *data, size_t len) {
    BOM *bom;

    for (bom = BOM_codes; bom->len; bom++) {
        if ((size_t)bom->len <= len && memcmp(bom->code, data, (size_t)bom->len) == 0) {
            return bom;
        }
    }
    return NULL;
}

/* Check whether the NUL-terminated string `buffer` starts with a known   */
/* byte-order mark. Only the bytes before the first NUL are examined, so  */
/* an empty string never matches and marks containing NUL bytes (UTF-32)  */
/* cannot be recognized through this entry point.                         */
/* Returns the matching table entry or NULL.                              */
BOM *check_BOM(char *buffer) {
    size_t len = 0;

    while (len < 4 && buffer[len] != '\0') {
        len++;
    }
    return bom_lookup(buffer, len);
}

/* Read the whole file `name` into a freshly allocated, NUL-terminated      */
/* buffer that the caller must free().                                      */
/* Returns NULL, after printing a message, if the file cannot be opened or  */
/* read, memory is exhausted, or the data starts with a byte-order mark.    */
char *file_to_mem(char *name) {
    FILE *infile;
    long filesize;
    size_t numbytes;
    char *buffer;
    BOM *bom;

    infile = fopen(name, "r");
    if (infile == NULL) {
        printf("Error opening file '%s'\n", name);
        return NULL;
    }
    if (fseek(infile, 0L, SEEK_END) != 0 ||
        (filesize = ftell(infile)) < 0 ||
        fseek(infile, 0L, SEEK_SET) != 0) {
        printf("Error reading file '%s'\n", name);
        fclose(infile);
        return NULL;
    }
    numbytes = (size_t) filesize;
    buffer = malloc(numbytes + 1);
    if (buffer == NULL) {
        printf("Error reading file '%s'\n", name);
        fclose(infile);
        return NULL;
    }
    if (fread(buffer, sizeof(char), numbytes, infile) != numbytes) {
        printf("Error reading file '%s'\n", name);
        free(buffer);
        fclose(infile);
        return NULL;
    }
    fclose(infile);
    buffer[numbytes] = '\0';

    /* The length is known here, so marks containing NUL bytes are found too */
    bom = bom_lookup(buffer, numbytes);
    if (bom != NULL) {
        printf("%s BOM mark is detected in file '%s'.\n", bom->name, name);
        free(buffer);
        return NULL;
    }
    return(buffer);
}