commit 531e9aee9e6468a3bcd73741fe59f8f1effa78fa Author: Daniel Swanson Date: Mon Aug 2 14:35:38 2021 -0500 helper functions continue to be fun diff --git a/lttoolbox/att_compiler.cc b/lttoolbox/att_compiler.cc index eaa0dd8..184f62c 100644 --- a/lttoolbox/att_compiler.cc +++ b/lttoolbox/att_compiler.cc @@ -435,37 +435,11 @@ AttCompiler::classify_backwards(int state, set& path) void AttCompiler::write(FILE *output) { -// FILE* output = fopen(file_name, "wb"); - fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); - uint64_t features = 0; - write_le(output, features); - - Transducer punct_fst = extract_transducer(PUNCT); - - /* Non-multichar symbols. */ - Compression::string_write(UString(letters.begin(), letters.end()), output); - /* Multichar symbols. */ - alphabet.write(output); - /* And now the FST. */ - if(punct_fst.numberOfTransitions() == 0) - { - Compression::multibyte_write(1, output); - } - else - { - Compression::multibyte_write(2, output); - } - Compression::string_write("main@standard"_u, output); - Transducer word_fst = extract_transducer(WORD); - word_fst.write(output); - cout << "main@standard" << " " << word_fst.size(); - cout << " " << word_fst.numberOfTransitions() << endl; - Compression::string_write("final@inconditional"_u, output); - if(punct_fst.numberOfTransitions() != 0) - { - punct_fst.write(output); - cout << "final@inconditional" << " " << punct_fst.size(); - cout << " " << punct_fst.numberOfTransitions() << endl; - } -// fclose(output); + UString letters = UString(letters.begin(), letters.end()); + map trans; + + trans["main@standard"_u] = extract_transducer(WORD); + trans["final@inconditional"_u] = extract_transducer(PUNCT); + + write_transducer_set(output, letters, alphabet, trans, true); } diff --git a/lttoolbox/compiler.cc b/lttoolbox/compiler.cc index 761caf3..a7fc8fa 100644 --- a/lttoolbox/compiler.cc +++ b/lttoolbox/compiler.cc @@ -946,40 +946,7 @@ Compiler::procRegexp() void Compiler::write(FILE *output) { - fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); - uint64_t features = 0; - features |= LTF_MMAP; - write_le_64(output, features); - - StringWriter sw; - StringRef letter_loc = sw.add(letters); - for (auto& it : alphabet.getTags()) { - sw.add(it); - } - for (auto& it : sections) { - sw.add(it.first); - } - - sw.write(output); - - // letters - write_le_32(output, letter_loc.start); - write_le_32(output, letter_loc.count); - - // symbols - alphabet.write_mmap(output, sw); - - // transducers - write_le_64(output, sections.size()); - - for(auto& it : sections) { - cout << it.first << " " << it.second.size(); - cout << " " << it.second.numberOfTransitions() << endl; - StringRef loc = sw.add(it.first); - write_le_32(output, loc.start); - write_le_32(output, loc.count); - it.second.write_mmap(output, alphabet); - } + write_transducer_set(output, letters, alphabet, sections); } void diff --git a/lttoolbox/lt_print.cc b/lttoolbox/lt_print.cc index c2c40db..5ee894b 100644 --- a/lttoolbox/lt_print.cc +++ b/lttoolbox/lt_print.cc @@ -130,67 +130,10 @@ int main(int argc, char *argv[]) } Alphabet alphabet; - set alphabetic_chars; - + UString letters; map transducers; - bool mmap = false; - fpos_t pos; - if (fgetpos(input, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, input); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(input); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(input, &pos); - } - } - - if (mmap) { - StringWriter sw; - sw.read(input); - - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - vector vec; - ustring_to_vec32(sw.get(s, c), vec); - alphabetic_chars.insert(vec.begin(), vec.end()); - - alphabet.read_mmap(input, sw); - - uint64_t tr_count = read_le_64(input); - for (uint64_t i = 0; i < tr_count; i++) { - uint32_t s = read_le_32(input); - uint32_t c = read_le_32(input); - UString name = UString{sw.get(s, c)}; - transducers[name].read_mmap(input, alphabet); - } - } else { - // letters - int len = Compression::multibyte_read(input); - while(len > 0) { - alphabetic_chars.insert(static_cast(Compression::multibyte_read(input))); - len--; - } - - // symbols - alphabet.read(input); - - len = Compression::multibyte_read(input); - - while(len > 0) { - UString name = Compression::string_read(input); - transducers[name].read(input); - - len--; - } - } + read_transducer_set(input, letters, alphabet, transducers); ///////////////////// diff --git a/lttoolbox/lt_trim.cc b/lttoolbox/lt_trim.cc index e1e3dc3..7547665 100644 --- a/lttoolbox/lt_trim.cc +++ b/lttoolbox/lt_trim.cc @@ -38,79 +38,18 @@ void endProgram(char *name) exit(EXIT_FAILURE); } -std::pair, std::map > -read_fst(FILE *bin_file) -{ - Alphabet new_alphabet; - - std::map transducers; - - fpos_t pos; - bool mmap = false; - if (fgetpos(bin_file, &pos) == 0) { - char header[4]{}; - fread_unlocked(header, 1, 4, bin_file); - if (strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { - auto features = read_le(bin_file); - if (features >= LTF_UNKNOWN) { - throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); - } - mmap = features & LTF_MMAP; - } - else { - // Old binary format - fsetpos(bin_file, &pos); - } - } - - UString letters; - - if (mmap) { - StringWriter sw; - sw.read(bin_file); - - uint32_t s = read_le_32(bin_file); - uint32_t c = read_le_32(bin_file); - letters = UString{sw.get(s, c)}; - - new_alphabet.read_mmap(bin_file, sw); - - uint64_t tr_count = read_le_64(bin_file); - for (uint64_t i = 0; i < tr_count; i++) { - uint32_t s = read_le_32(bin_file); - uint32_t c = read_le_32(bin_file); - UString name = UString{sw.get(s, c)}; - transducers[name].read_mmap(bin_file, new_alphabet); - } - } else { - // letters - letters = Compression::string_read(bin_file); - - // symbols - new_alphabet.read(bin_file); - - int len = Compression::multibyte_read(bin_file); - - while(len > 0) { - UString name = Compression::string_read(bin_file); - transducers[name].read(bin_file); - - len--; - } - } - - return make_pair(make_pair(new_alphabet, letters), transducers); -} - std::pair, std::map > trim(FILE *file_mono, FILE *file_bi) { - std::pair, std::map > alph_trans_mono = read_fst(file_mono); - Alphabet alph_mono = alph_trans_mono.first.first; - std::map trans_mono = alph_trans_mono.second; - std::pair, std::map > alph_trans_bi = read_fst(file_bi); - Alphabet alph_bi = alph_trans_bi.first.first; - std::map trans_bi = alph_trans_bi.second; + UString letters_mono; + Alphabet alph_mono; + std::map trans_mono; + read_transducer_set(file_mono, letters_mono, alph_mono, trans_mono); + + UString letters_bi; + Alphabet alph_bi; + std::map trans_bi; + read_transducer_set(file_bi, letters_bi, alph_bi, trans_bi); // The prefix transducer is the union of all transducers from bidix, // with a ".*" appended @@ -146,15 +85,13 @@ trim(FILE *file_mono, FILE *file_bi) alph_mono, alph_prefix); - cout << it->first << " " << it->second.size(); - cout << " " << it->second.numberOfTransitions() << endl; if(it->second.numberOfTransitions() == 0) { - cerr << "Warning: empty section! Skipping it ..."<first << " is empty! Skipping it ..."<first].clear(); } else if(trimmed.hasNoFinals()) { - cerr << "Warning: section had no final state after trimming! Skipping it ..."<first << " had no final state after trimming! Skipping it ..."<first].clear(); } else { @@ -163,8 +100,7 @@ trim(FILE *file_mono, FILE *file_bi) } } - alph_trans_mono.second = trans_mono; - return alph_trans_mono; + return make_pair(make_pair(alph_mono, letters_mono), trans_mono); } @@ -195,22 +131,6 @@ int main(int argc, char *argv[]) UString letters = trimmed.first.second; std::map trans_t = trimmed.second; - int n_transducers = 0; - for(auto& it : trans_t) { - if(!(it.second.isEmpty())) - { - n_transducers++; - } - } - - if(n_transducers == 0) - { - cerr << "Error: Trimming gave empty transducer!" << endl; - cerr << "Hint: There are no words in bilingual dictionary that match " - "words in both monolingual dictionaries?" << endl; - exit(EXIT_FAILURE); - } - // Write the file: FILE *output = fopen(argv[3], "wb"); if(!output) @@ -219,20 +139,13 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - // letters - Compression::string_write(letters, output); - - // symbols - alph_t.write(output); + int n_trans = write_transducer_set(output, letters, alph_t, trans_t, true); - // transducers - Compression::multibyte_write(n_transducers, output); - for(auto& it : trans_t) { - if(!(it.second.isEmpty())) - { - Compression::string_write(it.first, output); - it.second.write(output); - } + if (n_trans == 0) { + cerr << "Error: Trimming gave empty transducer!" << endl; + cerr << "Hint: There are no words in bilingual dictionary that match " + "words in both monolingual dictionaries?" << endl; + exit(EXIT_FAILURE); } fclose(analyser); diff --git a/lttoolbox/string_writer.cc b/lttoolbox/string_writer.cc index 4bc3d64..15d81f7 100644 --- a/lttoolbox/string_writer.cc +++ b/lttoolbox/string_writer.cc @@ -21,7 +21,7 @@ #include StringRef -StringWriter::add(const UString& s) +StringWriter::add(UString_view s) { auto start = buffer.find(s); if (start == UString::npos) { diff --git a/lttoolbox/string_writer.h b/lttoolbox/string_writer.h index 15fcaf3..12000a4 100644 --- a/lttoolbox/string_writer.h +++ b/lttoolbox/string_writer.h @@ -30,7 +30,7 @@ struct StringRef { class StringWriter { public: UString buffer; - StringRef add(const UString& s); + StringRef add(UString_view s); UString_view get(const uint32_t start, const uint32_t count); UString_view get(const StringRef& ref); void read(FILE* in); diff --git a/lttoolbox/transducer.cc b/lttoolbox/transducer.cc index d783c28..980c024 100644 --- a/lttoolbox/transducer.cc +++ b/lttoolbox/transducer.cc @@ -1374,3 +1374,114 @@ Transducer::intersect(Transducer &trimmer, // (instead of exiting the whole program) if no finals. return trimmed; } + +void +read_transducer_set(FILE* input, UString& letters, Alphabet& alpha, + map& trans) +{ + fpos_t pos; + bool mmap = false; + if (fgetpos(input, &pos) == 0) { + char header[4]{}; + auto r = fread_unlocked(header, 1, 4, input); + if (r == 4 && strncmp(header, HEADER_LTTOOLBOX, 4) == 0) { + auto features = read_le_64(input); + if (features >= LTF_UNKNOWN) { + throw std::runtime_error("FST has features that are unknown to this version of lttoolbox - upgrade!"); + } + mmap = features & LTF_MMAP; + } + else { + // Old binary format + fsetpos(input, &pos); + } + } + + if (mmap) { + // make copies of all the strings we get from StringWriter + // because it gets deallocated when the function returns + StringWriter sw; + sw.read(input); + + // letters + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + letters = UString{sw.get(s, c)}; + + // symbols + alpha.read_mmap(input, sw); + + uint64_t tr_count = read_le_64(input); + for (uint64_t i = 0; i < tr_count; i++) { + uint32_t s = read_le_32(input); + uint32_t c = read_le_32(input); + UString name = UString{sw.get(s, c)}; + trans[name].read_mmap(input, alpha); + } + } else { + // letters + letters = Compression::string_read(input); + + // symbols + alpha.read(input); + + int len = Compression::multibyte_read(input); + + while(len > 0) { + UString name = Compression::string_read(input); + trans[name].read(input); + + len--; + } + } +} + +uint64_t +write_transducer_set(FILE* output, UString_view letters, Alphabet& alpha, + map& trans, + bool skip_empty) +{ + fwrite_unlocked(HEADER_LTTOOLBOX, 1, 4, output); + uint64_t features = 0; + features |= LTF_MMAP; + write_le_64(output, features); + + uint64_t transducer_count = trans.size(); + + StringWriter sw; + StringRef letter_loc = sw.add(letters); + for (auto& it : alpha.getTags()) { + sw.add(it); + } + for (auto& it : trans) { + if (skip_empty && it.second.isEmpty()) { + transducer_count--; + continue; + } + sw.add(it.first); + } + sw.write(output); + + // letters + write_le_32(output, letter_loc.start); + write_le_32(output, letter_loc.count); + + // symbols + alpha.write_mmap(output, sw); + + // transducers + write_le_64(output, transducer_count); + for (auto& it : trans) { + if (skip_empty && it.second.isEmpty()) { + continue; + } + cout << it.first << " " << it.second.size(); + cout << " " << it.second.numberOfTransitions() << endl; + StringRef loc = sw.add(it.first); + write_le_32(output, loc.start); + write_le_32(output, loc.count); + it.second.write_mmap(output, alpha); + } + + return transducer_count; +} diff --git a/lttoolbox/transducer.h b/lttoolbox/transducer.h index 89b0d9f..394b683 100644 --- a/lttoolbox/transducer.h +++ b/lttoolbox/transducer.h @@ -421,4 +421,11 @@ public: }; +void read_transducer_set(FILE* input, UString& letters, Alphabet& alpha, + map& trans); +uint64_t write_transducer_set(FILE* output, + UString_view letters, Alphabet& alpha, + map& trans, + bool skip_empty=false); + #endif