Index: trunk/apertium/.gitignore =================================================================== --- trunk/apertium/.gitignore (revision 69620) +++ trunk/apertium/.gitignore (revision 69621) @@ -28,7 +28,7 @@ /*.pc -# /apertium/wildcard +# /apertium/ /apertium/.libs /apertium/apertium @@ -79,10 +79,3 @@ /apertium/transfer.rnc /apertium/stamp-* - -# Tests -/tests/**/Makefile.in -/tests/**/Makefile -/tests/**/.libs - -/tests/tagger/test-find-similar-ambiguity-class Index: trunk/apertium/Makefile.am =================================================================== --- trunk/apertium/Makefile.am (revision 69620) +++ trunk/apertium/Makefile.am (revision 69621) @@ -1,5 +1,5 @@ -SUBDIRS = $(GENERIC_LIBRARY_NAME) tests -DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) tests +SUBDIRS = $(GENERIC_LIBRARY_NAME) +DIST_SUBDIRS = $(GENERIC_LIBRARY_NAME) modesdir=$(prefix)/share/apertium/modes Index: trunk/apertium/apertium/apertium_tagger.cc =================================================================== --- trunk/apertium/apertium/apertium_tagger.cc (revision 69620) +++ trunk/apertium/apertium/apertium_tagger.cc (revision 69621) @@ -22,10 +22,12 @@ #include "basic_stream_tagger.h" #include "basic_stream_tagger_trainer.h" #include "basic_tagger.h" -#include "err_exception.h" #include "exception.h" #include "file_tagger.h" +#include "err_exception.h" +#include #include "linebreak.h" +#include #include "stream_5_3_1_tagger.h" #include "stream_5_3_1_tagger_trainer.h" #include "stream_5_3_2_tagger.h" @@ -32,18 +34,16 @@ #include "stream_5_3_2_tagger_trainer.h" #include "stream_5_3_3_tagger.h" #include "stream_5_3_3_tagger_trainer.h" -#include -#include #include #include -#include "getopt_long.h" #include #include #include #include #include +#include "getopt_long.h" #include #include #include @@ -61,20 +61,28 @@ apertium_tagger::apertium_tagger(int &argc, char **&argv) : argc(argc), argv(argv), The_val(), + The_indexptr(), FunctionTypeTypeOption_indexptr(), FunctionTypeOption_indexptr(), + TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), TheFunctionTypeOptionArgument(0), TheFlags() { try { while (true) { - The_val = getopt_long(argc, argv, "dfgmpr:s:t:u:wz", longopts, &The_indexptr); + The_val = + + getopt_long(argc, argv, "dfgmpr:s:t:u:wz", longopts, &The_indexptr); + + if (The_val == -1) break; + set_indexptr(); + switch (The_val) { case 'd': flagOptionCase(&basic_Tagger::Flags::getDebug, @@ -117,10 +125,10 @@ { std::stringstream what_; what_ << "invalid argument '" << optarg << "' for '--unigram'\n" - "Valid arguments are:\n" - " - '1'\n" - " - '2'\n" - " - '3'"; +"Valid arguments are:\n" +" - '1'\n" +" - '2'\n" +" - '3'"; throw Exception::apertium_tagger::InvalidArgument(what_); } break; @@ -283,25 +291,25 @@ void apertium_tagger::help() { std::cerr << -"Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" +"Usage: apertium-tagger [OPTION]... -g SERIALISED_BASIC_TAGGER \\\n" " [INPUT \\\n" " [OUTPUT]]\n" "\n" " or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" " CORPUS \\\n" -" SERIALISED_TAGGER\n" +" SERIALISED_BASIC_TAGGER\n" "\n" " or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" " DICTIONARY \\\n" " CORPUS \\\n" " TAGGER_SPECIFICATION \\\n" -" SERIALISED_TAGGER \\\n" +" SERIALISED_BASIC_TAGGER \\\n" " TAGGED_CORPUS \\\n" " UNTAGGED_CORPUS\n" "\n" " or: apertium-tagger [OPTION]... -s 0 \\\n" " -u MODEL \\\n" -" SERIALISED_TAGGER \\\n" +" SERIALISED_BASIC_TAGGER \\\n" " TAGGED_CORPUS\n" "\n" " or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" @@ -308,7 +316,7 @@ " DICTIONARY \\\n" " CORPUS \\\n" " TAGGER_SPECIFICATION \\\n" -" SERIALISED_TAGGER\n" +" SERIALISED_BASIC_TAGGER\n" "\n" "\n" "Mandatory arguments to long options are mandatory for short options too.\n" @@ -345,6 +353,7 @@ align::align_(options_description_); } + std::string apertium_tagger::option_string(const int &indexptr_) { return option_string(longopts[indexptr_]); } @@ -355,6 +364,7 @@ return option_string_.str(); } + void apertium_tagger::locale_global_() { #if defined __clang__ @@ -374,6 +384,7 @@ #endif // defined __clang__ } + const struct option apertium_tagger::longopts[] = { {"help", no_argument, 0, 'h'}, {"debug", no_argument, 0, 'd'}, @@ -389,6 +400,8 @@ {"train", required_argument, 0, 't'}, {0, 0, 0, 0}}; + + void apertium_tagger::set_indexptr() { if (The_val == longopts[The_indexptr].val) return; @@ -402,6 +415,7 @@ } } + void apertium_tagger::flagOptionCase( bool (basic_Tagger::Flags::*GetFlag)() const, void (basic_Tagger::Flags::*SetFlag)(const bool &)) { @@ -416,35 +430,66 @@ } std::string apertium_tagger::option_string() { + + return option_string(The_indexptr); + } void apertium_tagger::functionTypeTypeOptionCase( const FunctionTypeType &FunctionTypeType_) { - if (FunctionTypeTypeOption_indexptr) { + if ( + + + FunctionTypeTypeOption_indexptr + + + ) { std::stringstream what_; what_ << "unexpected '" << option_string() << "' following '" - << option_string(*FunctionTypeTypeOption_indexptr) - << '\''; + << option_string( + + + *FunctionTypeTypeOption_indexptr + + + ) << '\''; throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); } TheFunctionTypeType = FunctionTypeType_; + + FunctionTypeTypeOption_indexptr = The_indexptr; + } -void apertium_tagger::functionTypeOptionCase( - const FunctionType &FunctionType_) { - if (FunctionTypeOption_indexptr) { +void +apertium_tagger::functionTypeOptionCase(const FunctionType &FunctionType_) { + if ( + + + FunctionTypeOption_indexptr + + + ) { std::stringstream what_; what_ << "unexpected '" << option_string() << "' following '" - << option_string(*FunctionTypeOption_indexptr) - << '\''; + << option_string( + + + *FunctionTypeOption_indexptr + + + ) << '\''; throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); } TheFunctionType = FunctionType_; + + FunctionTypeOption_indexptr = The_indexptr; + } void apertium_tagger::getIterationsArgument() { @@ -485,45 +530,6 @@ return N_0; } -template -static void try_open_fstream(const char *metavar, const char *filename, - T &stream) { - stream.open(filename); - if (stream.fail()) { - std::stringstream what_; - what_ << "can't open " << metavar << " file \"" << filename << "\""; - throw Exception::apertium_tagger::open_stream_fail(what_); - } -} - -static FILE *try_open_file(const char *metavar, const char *filename, - const char *flags) { - FILE *f = std::fopen(filename, flags); - if (f == NULL) { - std::stringstream what_; - what_ << "can't open " << metavar << " file \"" << filename << "\""; - throw Exception::apertium_tagger::fopen(what_); - } - return f; -} - -static inline FILE *try_open_file_utf8(const char *metavar, const char *filename, - const char *flags) { - FILE *f = try_open_file(metavar, filename, flags); -#ifdef _MSC_VER - _setmode(_fileno(f), _O_U8TEXT); -#endif // _MSC_VER - return f; -} - -static void try_close_file(const char *metavar, const char *filename, FILE *file) { - if (std::fclose(file) != 0) { - std::stringstream what_; - what_ << "can't close " << metavar << " file \"" << filename << "\""; - throw Exception::apertium_tagger::fclose(what_); - } -} - void apertium_tagger::g_StreamTagger(basic_StreamTagger &StreamTagger_) { locale_global_(); @@ -533,15 +539,20 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - std::ifstream SerialisedAnalysisFrequencies; - try_open_fstream("SERIALISED_TAGGER", argv[optind], - SerialisedAnalysisFrequencies); + std::ifstream SerialisedAnalysisFrequencies(argv[optind]); + if (SerialisedAnalysisFrequencies.fail()) { + std::stringstream what_; + what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] + << "\""; + throw Exception::apertium_tagger::ifstream_fail(what_); + } + try { StreamTagger_.deserialise(SerialisedAnalysisFrequencies); } catch (const basic_ExceptionType &basic_ExceptionType_) { std::stringstream what_; - what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] + what_ << "can't deserialise SERIALISED_BASIC_TAGGER file \"" << argv[optind] << "\" Reason: " << basic_ExceptionType_.what(); throw Exception::apertium_tagger::deserialise(what_); } @@ -552,9 +563,14 @@ return; } - std::wifstream Input_stream; - try_open_fstream("INPUT", argv[optind + 1], Input_stream); + std::wifstream Input_stream(argv[optind + 1]); + if (Input_stream.fail()) { + std::stringstream what_; + what_ << "can't open INPUT file \"" << argv[optind + 1] << "\""; + throw Exception::apertium_tagger::wifstream_fail(what_); + } + if (argc - optind < 3) { Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, std::wcout); @@ -561,9 +577,14 @@ return; } - std::wofstream Output_stream; - try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); + std::wofstream Output_stream(argv[optind + 2]); + if (Output_stream.fail()) { + std::stringstream what_; + what_ << "can't open OUTPUT file \"" << argv[optind + 2] << "\""; + throw Exception::apertium_tagger::wofstream_fail(what_); + } + Stream Input(TheFlags, Input_stream, argv[optind + 1]); StreamTagger_.tag(Input, Output_stream); } @@ -585,16 +606,26 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - std::wifstream TaggedCorpus_stream; - try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); + std::wifstream TaggedCorpus_stream(argv[optind + 1]); + if (TaggedCorpus_stream.fail()) { + std::stringstream what_; + what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 1] << "\""; + throw Exception::apertium_tagger::wifstream_fail(what_); + } + Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind]); StreamTaggerTrainer_.train(TaggedCorpus); - std::ofstream Serialised_basic_Tagger; - try_open_fstream("SERIALISED_TAGGER", argv[optind], - Serialised_basic_Tagger); + std::ofstream Serialised_basic_Tagger(argv[optind]); + if (Serialised_basic_Tagger.fail()) { + std::stringstream what_; + what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] + << "\""; + throw Exception::apertium_tagger::ofstream_fail(what_); + } + StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); } @@ -607,11 +638,24 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); + FILE *Serialised_FILE_Tagger = std::fopen(argv[optind], "rb"); + + if (Serialised_FILE_Tagger == NULL) { + std::stringstream what_; + what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind] + << "\" for reading in binary mode"; + throw Exception::apertium_tagger::fopen(what_); + } + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); + if (std::fclose(Serialised_FILE_Tagger) != 0) { + std::stringstream what_; + what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind] + << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); TaggerWord::generate_marks = TheFlags.getMark(); @@ -621,18 +665,54 @@ if (argc - optind < 2) FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst()); else { - FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); + FILE *Input = std::fopen(argv[optind + 1], "r"); + if (Input == NULL) { + std::stringstream what_; + what_ << "can't open INPUT file \"" << argv[optind + 1] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(Input), _O_U8TEXT); + +#endif // _MSC_VER + if (argc - optind < 3) FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst()); else { - FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); + FILE *Output = std::fopen(argv[optind + 2], "w"); + + if (Output == NULL) { + std::stringstream what_; + what_ << "can't open OUTPUT file \"" << argv[optind + 2] + << "\" for writing"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(Output), _O_U8TEXT); + +#endif // _MSC_VER + FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst()); - try_close_file("OUTPUT", argv[optind + 2], Output); + + if (std::fclose(Output) != 0) { + std::stringstream what_; + what_ << "can't close OUTPUT file \"" << argv[optind + 2] << "\""; + throw Exception::apertium_tagger::fclose(what_); } + } - try_close_file("INPUT", argv[optind + 1], Input); + if (std::fclose(Input) != 0) { + std::stringstream what_; + what_ << "can't close INPUT file \"" << argv[optind + 1] << "\""; + throw Exception::apertium_tagger::fclose(what_); } + } } void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -644,22 +724,66 @@ throw Exception::apertium_tagger::UnexpectedFileArgumentCount(what_); } - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind + 1], "rb"); + FILE *Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "rb"); + + if (Serialised_FILE_Tagger == NULL) { + std::stringstream what_; + what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] + << "\" for reading in binary mode"; + throw Exception::apertium_tagger::fopen(what_); + } + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); + if (std::fclose(Serialised_FILE_Tagger) != 0) { + std::stringstream what_; + what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] + << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind], "r"); + FILE *Corpus = std::fopen(argv[optind], "r"); + + if (Corpus == NULL) { + std::stringstream what_; + what_ << "can't open CORPUS file \"" << argv[optind] << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(Corpus), _O_U8TEXT); + +#endif // _MSC_VER + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", argv[optind], Corpus); - Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind + 1], "wb"); + if (std::fclose(Corpus) != 0) { + std::stringstream what_; + what_ << "can't close CORPUS file \"" << argv[optind] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + Serialised_FILE_Tagger = std::fopen(argv[optind + 1], "wb"); + + if (Serialised_FILE_Tagger == NULL) { + std::stringstream what_; + what_ << "can't open SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] + << "\" for writing in binary mode"; + throw Exception::apertium_tagger::fopen(what_); + } + FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind + 1], Serialised_FILE_Tagger); + + if (std::fclose(Serialised_FILE_Tagger) != 0) { + std::stringstream what_; + what_ << "can't close SERIALISED_BASIC_TAGGER file \"" << argv[optind + 1] + << "\""; + throw Exception::apertium_tagger::fclose(what_); + } } void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -675,25 +799,102 @@ FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); + FILE *Dictionary = std::fopen(argv[optind], "r"); + + if (Dictionary == NULL) { + std::stringstream what_; + what_ << "can't open DICTIONARY file \"" << argv[optind] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + FILE_Tagger_.read_dictionary(Dictionary); - try_close_file("DICTIONARY", argv[optind], Dictionary); - FILE *TaggedCorpus = try_open_file_utf8("TAGGED_CORPUS", argv[optind + 4], "r"); - FILE *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", argv[optind + 5], "r"); + if (std::fclose(Dictionary) != 0) { + std::stringstream what_; + what_ << "can't close DICTIONARY file \"" << argv[optind] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + FILE *TaggedCorpus = std::fopen(argv[optind + 4], "r"); + + if (TaggedCorpus == NULL) { + std::stringstream what_; + what_ << "can't open TAGGED_CORPUS file \"" << argv[optind + 4] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + + FILE *UntaggedCorpus = std::fopen(argv[optind + 5], "r"); + + if (UntaggedCorpus == NULL) { + std::stringstream what_; + what_ << "can't open UNTAGGED_CORPUS file \"" << argv[optind + 5] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(TaggedCorpus), _O_U8TEXT); + _setmode(_fileno(UntaggedCorpus), _O_U8TEXT); + +#endif // _MSC_VER + FILE_Tagger_.init_probabilities_from_tagged_text_(TaggedCorpus, UntaggedCorpus); - try_close_file("TAGGED_CORPUS", argv[optind + 4], TaggedCorpus); - try_close_file("UNTAGGED_CORPUS", argv[optind + 5], UntaggedCorpus); - FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); + if (std::fclose(TaggedCorpus) != 0) { + std::stringstream what_; + what_ << "can't close TAGGED_CORPUS file \"" << argv[optind + 4] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + if (std::fclose(UntaggedCorpus) != 0) { + std::stringstream what_; + what_ << "can't close UNTAGGED_CORPUS file \"" << argv[optind + 5] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + FILE *Corpus = std::fopen(argv[optind + 1], "r"); + + if (Corpus == NULL) { + std::stringstream what_; + what_ << "can't open CORPUS file \"" << argv[optind + 1] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(Corpus), _O_U8TEXT); + +#endif // _MSC_VER + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", argv[optind + 1], UntaggedCorpus); - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); - FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind + 3], UntaggedCorpus); + if (std::fclose(Corpus) != 0) { + std::stringstream what_; + what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + FILE *Stream_ = std::fopen(argv[optind + 3], "wb"); + + if (Stream_ == NULL) { + std::stringstream what_; + what_ << "can't open STREAM file \"" << argv[optind + 3] + << "\" for writing in binary mode"; + throw Exception::apertium_tagger::fopen(what_); + } + + FILE_Tagger_.serialise(Stream_); + + if (std::fclose(Stream_) != 0) { + std::stringstream what_; + what_ << "can't close STREAM file \"" << argv[optind + 3] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } } void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { @@ -709,19 +910,63 @@ FILE_Tagger_.set_debug(TheFlags.getDebug()); TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - FILE *Dictionary = try_open_file("DICTIONARY", argv[optind], "r"); + FILE *Dictionary = std::fopen(argv[optind], "r"); + + if (Dictionary == NULL) { + std::stringstream what_; + what_ << "can't open DICTIONARY file \"" << argv[optind] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + FILE_Tagger_.read_dictionary(Dictionary); - try_close_file("DICTIONARY", argv[optind], Dictionary); - FILE *Corpus = try_open_file_utf8("CORPUS", argv[optind + 1], "r"); + if (std::fclose(Dictionary) != 0) { + std::stringstream what_; + what_ << "can't close DICTIONARY file \"" << argv[optind] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + FILE *Corpus = std::fopen(argv[optind + 1], "r"); + + if (Corpus == NULL) { + std::stringstream what_; + what_ << "can't open CORPUS file \"" << argv[optind + 1] + << "\" for reading"; + throw Exception::apertium_tagger::fopen(what_); + } + +#ifdef _MSC_VER + + _setmode(_fileno(Corpus), _O_U8TEXT); + +#endif // _MSC_VER + FILE_Tagger_.init_probabilities_kupiec_(Corpus); FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", argv[optind + 1], Corpus); - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind + 3], "wb"); - FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind + 3], Serialised_FILE_Tagger); + if (std::fclose(Corpus) != 0) { + std::stringstream what_; + what_ << "can't close CORPUS file \"" << argv[optind + 1] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } + + FILE *Stream_ = std::fopen(argv[optind + 3], "wb"); + + if (Stream_ == NULL) { + std::stringstream what_; + what_ << "can't open STREAM file \"" << argv[optind + 3] + << "\" for writing in binary mode"; + throw Exception::apertium_tagger::fopen(what_); + } + + FILE_Tagger_.serialise(Stream_); + + if (std::fclose(Stream_) != 0) { + std::stringstream what_; + what_ << "can't close STREAM file \"" << argv[optind + 3] << "\""; + throw Exception::apertium_tagger::fclose(what_); + } } } Index: trunk/apertium/apertium/exception.h =================================================================== --- trunk/apertium/apertium/exception.h (revision 69620) +++ trunk/apertium/apertium/exception.h (revision 69621) @@ -40,9 +40,12 @@ EXCEPTION(deserialise) EXCEPTION(fclose) EXCEPTION(fopen) -EXCEPTION(open_stream_fail) +EXCEPTION(ifstream_fail) +EXCEPTION(ofstream_fail) EXCEPTION(optarg_eq_NULL) EXCEPTION(str_end_not_eq_NULL) +EXCEPTION(wifstream_fail) +EXCEPTION(wofstream_fail) EXCEPTION(ERANGE_) EXCEPTION(InvalidArgument) EXCEPTION(InvalidOption) Index: trunk/apertium/apertium/hmm.cc =================================================================== --- trunk/apertium/apertium/hmm.cc (revision 69620) +++ trunk/apertium/apertium/hmm.cc (revision 69621) @@ -710,7 +710,7 @@ TaggerWord *word=NULL; TTag tag; - set ambg_class_tags, tags, pretags; + set tags, pretags; set ::iterator itag, jtag; double prob, loli, x; @@ -750,9 +750,9 @@ if (tags.size()==0) // This is an unknown word tags = tdhmm.getOpenClass(); - ambg_class_tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); + tags = require_similar_ambiguity_class(tdhmm, tags, *word, debug); - k = output[ambg_class_tags]; //Ambiguity class the word belongs to + k = output[tags]; //Ambiguity class the word belongs to #ifdef __GNUC__ clear_array_double(alpha[nwpend%2], N); Index: trunk/apertium/apertium/lswpost.cc =================================================================== --- trunk/apertium/apertium/lswpost.cc (revision 69620) +++ trunk/apertium/apertium/lswpost.cc (revision 69621) @@ -342,12 +342,14 @@ word_left->set_show_sf(show_sf); tags_left = word_left->get_tags(); // tags left - warn_absent_ambiguity_class(tdlsw, tags_left, *word_left, debug); + tags_left = require_similar_ambiguity_class(tdlsw, tags_left, *word_left, debug); + word_mid = morpho_stream.get_next_word(); // word mid word_mid->set_show_sf(show_sf); tags_mid = word_mid->get_tags(); // tags mid - warn_absent_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + tags_mid = require_similar_ambiguity_class(tdlsw, tags_mid, *word_mid, debug); + if (morpho_stream.getEndOfFile()) { delete word_left; delete word_mid; @@ -359,8 +361,9 @@ wstring micad; while (word_right) { + tags_right = word_right->get_tags(); - warn_absent_ambiguity_class(tdlsw, tags_right, *word_right, debug); + tags_right = require_similar_ambiguity_class(tdlsw, tags_right, *word_right, debug); double max = -1; TTag tag_max = *tags_mid.begin(); Index: trunk/apertium/apertium/tagger_utils.cc =================================================================== --- trunk/apertium/apertium/tagger_utils.cc (revision 69620) +++ trunk/apertium/apertium/tagger_utils.cc (revision 69621) @@ -20,8 +20,6 @@ #include #include -#include -#include #include #ifdef _MSC_VER #define wcstok wcstok_s @@ -169,18 +167,27 @@ set tagger_utils::find_similar_ambiguity_class(TaggerData &td, set &c) { - set &ret = td.getOpenClass(); + int size_ret = -1; + set ret = td.getOpenClass(); // return open-class as default, if no better is found. + bool skip_class; Collection &output = td.getOutput(); - for (int k=0; k &ambg_class = output[k]; - if (ambg_class.size() >= ret.size()) { - continue; + for(int k=0; k((int)size_ret)) && (((int)output[k].size())<((int)c.size()))) { + skip_class = false; + // Test if output[k] is a subset of class + for(set::const_iterator it=output[k].begin(); it!=output[k].end(); it++) { + if (c.find(*it)==c.end()) { + skip_class = true; //output[k] is not a subset of class + break; } - if (includes(ambg_class.begin(), ambg_class.end(), c.begin(), c.end())) { - ret = ambg_class; } + if (!skip_class) { + size_ret = output[k].size(); + ret = output[k]; } + } + } return ret; } @@ -201,7 +208,10 @@ } } -static void _warn_absent_ambiguity_class(TaggerWord &word) { +set +tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { + if (td.getOutput().has_not(tags)) { + if (debug) { wstring errors; errors = L"A new ambiguity class was found. \n"; errors += L"Retraining the tagger is necessary so as to take it into account.\n"; @@ -208,13 +218,6 @@ errors += L"Word '" + word.get_superficial_form() + L"'.\n"; errors += L"New ambiguity class: " + word.get_string_tags() + L"\n"; wcerr << L"Error: " << errors; -} - -set -tagger_utils::require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { - if (td.getOutput().has_not(tags)) { - if (debug) { - _warn_absent_ambiguity_class(word); } return find_similar_ambiguity_class(td, tags); } @@ -221,13 +224,6 @@ return tags; } -void -tagger_utils::warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug) { - if (td.getOutput().has_not(tags) && debug) { - _warn_absent_ambiguity_class(word); - } -} - template ostream& operator<< (ostream& os, const map & f){ typename map ::const_iterator it; Index: trunk/apertium/apertium/tagger_utils.h =================================================================== --- trunk/apertium/apertium/tagger_utils.h (revision 69620) +++ trunk/apertium/apertium/tagger_utils.h (revision 69621) @@ -91,9 +91,6 @@ * & prints a warning if debug */ set require_similar_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); -/** Just prints a warning if debug */ -void warn_absent_ambiguity_class(TaggerData &td, set &tags, TaggerWord &word, bool debug); - wstring trim(wstring s); }; Index: trunk/apertium/configure.ac =================================================================== --- trunk/apertium/configure.ac (revision 69620) +++ trunk/apertium/configure.ac (revision 69621) @@ -186,4 +186,4 @@ AM_CONDITIONAL([WINDOWS], [test x$version_type = xwindows]) AS_IF([test x$version_type = xwindows], [AC_DEFINE(HAVE_GETOPT_LONG,0)], []) -AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile]) +AC_OUTPUT([Makefile apertium.pc apertium/Makefile]) Index: trunk/apertium/tests/Makefile.am =================================================================== --- trunk/apertium/tests/Makefile.am (revision 69620) +++ trunk/apertium/tests/Makefile.am (nonexistent) @@ -1 +0,0 @@ -SUBDIRS = tagger Index: trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc =================================================================== --- trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (revision 69620) +++ trunk/apertium/tests/tagger/test_find_similar_ambiguity_classes.cc (nonexistent) @@ -1,61 +0,0 @@ -#include "apertium/utf_converter.h" -#include "apertium/tagger_utils.h" -#include "apertium/tagger_data_hmm.h" -#include "apertium/tagger_data.h" -#include -#include -#include -#include - -void print_ambiguity_class(const vector &array_tags, const set &abgset) -{ - unsigned int j; - set::const_iterator abgseti; - for (abgseti=abgset.begin(), j=0; abgseti!=abgset.end(); abgseti++, j++) { - wcout << array_tags[*abgseti]; - if (j < abgset.size() - 1) { - wcout << " "; - } - } -} - -void find_similar_ambiguity_class_io(TaggerData &td) -{ - vector &array_tags = td.getArrayTags(); - wstring line = L""; - getline(wcin, line, L'\n'); - - wstringstream line_stream(line); - set ambiguity_class; - wstring tag_name; - while (line_stream >> tag_name) { - vector::iterator it; - it = find(array_tags.begin(), array_tags.end(), tag_name); - if (it == array_tags.end()) { - wcerr << L"Tag not in model: " << tag_name << L'\n'; - exit(-3); - } - ambiguity_class.insert(it - array_tags.begin()); - } - set similar_ambiguity_class = tagger_utils::find_similar_ambiguity_class(td, ambiguity_class); - print_ambiguity_class(array_tags, similar_ambiguity_class); -} - -int main(int argc, char *argv[]) -{ - if (argc < 2) { - cerr<<"Usage: "<\n"; - exit(-1); - } - char* probfile = argv[1]; - TaggerDataHMM tagger_data_hmm; - FILE* fin = fopen(probfile, "r"); - if (!fin) { - cerr<<"Error: cannot open file '"< 0: - print("run " + " ".join(args[0])) - return f(*args, **kwargs) - return inner - - -def trace_plus_unicode(f): - return functools.partial(trace_dec(f), universal_newlines=True) - -check_call = trace_plus_unicode(check_call) -check_output = trace_plus_unicode(check_output) -check_stderr = trace_plus_unicode(check_stderr) - -# Test files -DIC = """ -^the/the$ -^books/book/book$ -^has/have$ -^booked/book/book$ -^close/close/close/close/close/close$ -^cat/cat$ -^room/room$ -^red/red$ -^./.$ -""".strip() - -TSX = """ - - - - - - - - - - - - - - - - - - - - -""".strip() - -TRAIN_NO_PROBLEM_UNTAGGED = """ -^The/the$ -^cat/cat$ -^books/book/book$ -^the/the$ -^room/room$ -^./.$ - -^The/the$ -^red/red$ -^cat/cat$ -^books/book/book$ -^the/the$ -^red/red$ -^room/room$ -^./.$ - -^The/the$ -^red/red$ -^cat/cat$ -^books/book/book$ -^the/the$ -^room/room$ -^./.$ -""".strip() - -TRAIN_NO_PROBLEM_TAGGED = """ -^The/the$ -^cat/cat$ -^books/book$ -^the/the$ -^room/room$ -^./.$ - -^The/the$ -^red/red$ -^cat/cat$ -^books/book$ -^the/the$ -^red/red$ -^room/room$ -^./.$ - -^The/the$ -^red/red$ -^cat/cat$ -^books/book$ -^the/the$ -^room/room$ -^./.$ -""".strip() - -TRAIN_CAT_TO_BE_A_VERB_UNTAGGED = """ -^The/The$ -^falling/fall/fall/fall$ -^cat/cat$ -^has/have$ -^booked/book/book$ -^books/book/book$ -^./.$ - -^Close/close/close/close/close/close$ -^the/the$ -^books/book/book$ -^./.$ - -^The/the$ -^falling/fall/fall/fall$ -^cat/cat$ -^has/have$ -^books/book/book$ -^./.$ -""".strip() - -TRAIN_CAT_TO_BE_A_VERB_TAGGED = """ -^The/The$ -^falling/fall$ -^cat/cat$ -^has/have$ -^booked/book$ -^books/book$ -^./.$ - -^Close/close$ -^the/the$ -^books/book$ -^./.$ - -^The/the$ -^falling/fall$ -^cat/cat$ -^has/have$ -^books/book$ -^./.$ -""".strip() - -TEST_SUCCESS = """ -^The/the$ -^cat/cat$ -^books/book/book$ -^the/the$ -^room/room$ -^./.$ -""".strip() - -TEST_NEW_AMBG_CLASS = """ -^The/the$ -^cat/cat/cat$ -^books/book/book$ -^the/the$ -^room/room$ -^./.$ -""".strip() - -# Expected strings -EXPECTED_SUBST = """ -Error: A new ambiguity class was found. -Retraining the tagger is necessary so as to take it into account. -Word 'cat'. -New ambiguity class: {NOUN,ADJ} -""".strip().split("\n") - - -# Tests -class AmbiguityClassTest(unittest.TestCase): - def setUp(self): - self.tsx_fn = tmp(TSX) - self.dic_fn = tmp(DIC) - - def changing_class_impl(self, flags, model_fn): - test1 = tmp(TEST_SUCCESS) - test2 = tmp(TEST_NEW_AMBG_CLASS) - success_stderr = check_stderr( - [APERTIUM_TAGGER, '-d'] + flags + - ['-g', model_fn, test1], - stdout=DEVNULL) - self.assertEqual(success_stderr.strip(), "") - subst_stderr = check_stderr( - [APERTIUM_TAGGER, '-d'] + flags + - ['-g', model_fn, test2], - stdout=DEVNULL) - subst_stderr = [line.strip() - for line in subst_stderr.strip().split("\n")] - self.assertEqual(subst_stderr, EXPECTED_SUBST) - ambg_class = check_output( - [rel('test-find-similar-ambiguity-class'), model_fn], - input="NOUN ADJ\n") - substituted_class = set(ambg_class.split(" ")) - # Should get open class - self.assertSetEqual(substituted_class, set(("VERB", "NOUN", "ADJ"))) - - def test_changing_class_hmm_sup(self): - model_fn = tmp("") - untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) - tagged = tmp(TRAIN_NO_PROBLEM_TAGGED) - check_call( - [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, - model_fn, tagged, untagged]) - self.changing_class_impl([], model_fn) - - def test_changing_class_hmm_unsup(self): - model_fn = tmp("") - untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) - check_call( - [APERTIUM_TAGGER, '-t', '1', self.dic_fn, untagged, self.tsx_fn, - model_fn]) - self.changing_class_impl([], model_fn) - - def test_changing_class_sliding_window(self): - model_fn = tmp("") - untagged = tmp(TRAIN_NO_PROBLEM_UNTAGGED) - check_call( - [APERTIUM_TAGGER, '--sliding-window', '-t', '1', self.dic_fn, - untagged, self.tsx_fn, model_fn]) - self.changing_class_impl(['--sliding-window'], model_fn) - - def test_cat_is_a_verb(self): - model_fn = tmp("") - untagged = tmp(TRAIN_CAT_TO_BE_A_VERB_UNTAGGED) - tagged = tmp(TRAIN_CAT_TO_BE_A_VERB_TAGGED) - new_ambg_class = tmp(TEST_NEW_AMBG_CLASS) - check_call( - [APERTIUM_TAGGER, '-s', '0', self.dic_fn, untagged, self.tsx_fn, - model_fn, tagged, untagged]) - subst_stdout = check_output( - [APERTIUM_TAGGER, '-d', '-g', model_fn, new_ambg_class], - stderr=DEVNULL) - acceptable = False - for line in subst_stdout.split("\n"): - if (line.startswith('^cat') and ('' in line or '' in line)): - acceptable = True - self.assertTrue( - acceptable, - "'cat' must be output and tagged as an adjective or a noun.\n" + - "Actual output:\n{}".format(subst_stdout)) Property changes on: trunk/apertium/tests/tagger/__init__.py ___________________________________________________________________ Deleted: svn:executable ## -1 +0,0 ## -* \ No newline at end of property Index: trunk/apertium/tests/tagger/Makefile.am =================================================================== --- trunk/apertium/tests/tagger/Makefile.am (revision 69620) +++ trunk/apertium/tests/tagger/Makefile.am (nonexistent) @@ -1,14 +0,0 @@ -library_includedir = $(includedir)/$(GENERIC_LIBRARY_NAME)-$(GENERIC_API_VERSION)/$(GENERIC_LIBRARY_NAME) - -bin_PROGRAMS = test-find-similar-ambiguity-class -bin_SCRIPTS = $(GENERATEDSCRIPTS) - -AM_CPPFLAGS = -I$(top_srcdir) - -apertiumdir = $(prefix)/share/apertium -apertiuminclude = $(prefix)/include/apertium-$(GENERIC_API_VERSION) -apertiumlib = $(prefix)/lib -apertiumsysconf = $(prefix)/etc/apertium - -test_find_similar_ambiguity_class_SOURCES = test_find_similar_ambiguity_classes.cc -test_find_similar_ambiguity_class_LDADD = -L$(top_srcdir)/$(GENERIC_LIBRARY_NAME)/.libs/ $(APERTIUM_LIBS) -l$(GENERIC_LIBRARY_NAME)$(GENERIC_MAJOR_VERSION) Index: trunk/apertium/tests/run_tests.py =================================================================== --- trunk/apertium/tests/run_tests.py (revision 69620) +++ trunk/apertium/tests/run_tests.py (revision 69621) @@ -6,12 +6,11 @@ import unittest import pretransfer -import tagger if __name__ == "__main__": os.chdir(os.path.dirname(__file__)) failures = 0 - for module in [pretransfer, tagger]: + for module in [pretransfer]: suite = unittest.TestLoader().loadTestsFromModule(module) res = unittest.TextTestRunner(verbosity = 2).run(suite) failures += len(res.failures)