commit fedc754345a310df9a8a7adafd450b20e9791eab Author: Lokendra Singh Date: Mon Jul 15 02:36:46 2019 +0530 Swig wrapper for apertium-tagger (#52) * renamed: apertium_tagger.h -> tagger.h * Split: apertium_tagger.cc to tagger.h & tagger.cc * Set optind in constructor * Wrapper for apertium-tagger * include headers in proper location .h has headers required for prototyping .cc has headers required for definition * Added comments for individual dependency * Renamed: apertium_tag -> tag * Added: pretransfer.h: copyright notice * Renamed: apertium_core.tag -> apertium_core.tagger * Updated comment * Sync: apertium/master * tagger.h compatible with swig 3.0.12 Removed: const qualifier from Apertium::basic_Tagger::Flags and derived classes * Build wrapper on travis diff --git a/.travis.yml b/.travis.yml index 095963c..6af4cdf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,9 +5,9 @@ compiler: - gcc before_install: - wget https://apertium.projectjj.com/apt/install-nightly.sh -O - | sudo bash - - sudo apt-get install -y lttoolbox-dev libxml2-utils xsltproc libpcre3 + - sudo apt-get install -y lttoolbox-dev libxml2-utils xsltproc libpcre3 swig script: - - ./autogen.sh + - ./autogen.sh --enable-python-bindings - ./configure - make - make test diff --git a/apertium/Makefile.am b/apertium/Makefile.am index 377c1a0..0893e3b 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -4,7 +4,6 @@ h_sources = a.h \ align.h \ analysis.h \ apertium_re.h \ - apertium_tagger.h \ basic_5_3_1_tagger.h \ basic_5_3_2_tagger.h \ basic_5_3_3_tagger.h \ @@ -58,6 +57,7 @@ h_sources = a.h \ string_utils.h \ shell_utils.h \ tag.h \ + tagger.h \ tagger_data.h \ tagger_data_hmm.h \ tagger_data_lsw.h \ @@ -147,6 +147,7 @@ cc_sources = a.cc \ string_utils.cc \ shell_utils.cc \ tag.cc \ + tagger.cc \ tagger_data.cc \ tagger_data_hmm.cc \ tagger_data_lsw.cc \ diff --git a/apertium/apertium_tagger.cc b/apertium/apertium_tagger.cc index 3703100..e726257 100644 --- a/apertium/apertium_tagger.cc +++ b/apertium/apertium_tagger.cc @@ -13,32 +13,7 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, see . -#include "apertium_tagger.h" - -#include "apertium_config.h" - -#include "align.h" -#include "basic_exception_type.h" -#include "basic_stream_tagger.h" -#include "basic_stream_tagger_trainer.h" -#include "basic_tagger.h" -#include "err_exception.h" -#include "exception.h" -#include "file_tagger.h" -#include "linebreak.h" -#include "stream_5_3_1_tagger.h" -#include "stream_5_3_1_tagger_trainer.h" -#include "stream_5_3_2_tagger.h" -#include "stream_5_3_2_tagger_trainer.h" -#include "stream_5_3_3_tagger.h" -#include "stream_5_3_3_tagger_trainer.h" -#include -#include -#include -#include -#include - -#include +#include #include "getopt_long.h" #include @@ -54,768 +29,10 @@ #include #include -namespace Apertium { -using namespace ShellUtils; - -/** Top level argument parsing */ - -apertium_tagger::apertium_tagger(int &argc, char **&argv) - : argc(argc), argv(argv), The_val(), nonoptarg(), - - The_indexptr(), FunctionTypeTypeOption_indexptr(), - FunctionTypeOption_indexptr(), - - TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), - TheFunctionTypeOptionArgument(0), TheFlags() { - try { - while (true) { - The_val = getopt_long(argc, argv, "bdfegmpr:s:t:u:wxz", longopts, &The_indexptr); - - if (The_val == -1) - break; - - set_indexptr(); - - switch (The_val) { - case 'b': - flagOptionCase(&basic_Tagger::Flags::getSentSeg, - &basic_Tagger::Flags::setSentSeg); - break; - case 'd': - flagOptionCase(&basic_Tagger::Flags::getDebug, - &basic_Tagger::Flags::setDebug); - break; - case 'e': - flagOptionCase(&basic_Tagger::Flags::getSkipErrors, - &basic_Tagger::Flags::setSkipErrors); - break; - case 'f': - flagOptionCase(&basic_Tagger::Flags::getFirst, - &basic_Tagger::Flags::setFirst); - break; - case 'm': - flagOptionCase(&basic_Tagger::Flags::getMark, - &basic_Tagger::Flags::setMark); - break; - case 'p': - flagOptionCase(&basic_Tagger::Flags::getShowSuperficial, - &basic_Tagger::Flags::setShowSuperficial); - break; - case 'z': - flagOptionCase(&basic_Tagger::Flags::getNullFlush, - &basic_Tagger::Flags::setNullFlush); - break; - case 'u': - functionTypeTypeOptionCase(Unigram); - - if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { - TheUnigramType = Stream_5_3_1; - break; - } - - if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { - TheUnigramType = Stream_5_3_2; - break; - } - - if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { - TheUnigramType = Stream_5_3_3; - break; - } - - { - std::stringstream what_; - what_ << "invalid argument '" << optarg << "' for '--unigram'\n" - "Valid arguments are:\n" - " - '1'\n" - " - '2'\n" - " - '3'"; - throw Exception::apertium_tagger::InvalidArgument(what_); - } - break; - case 'w': - functionTypeTypeOptionCase(SlidingWindow); - break; - case 'x': - functionTypeTypeOptionCase(Perceptron); - break; - case 'g': - functionTypeOptionCase(Tagger); - break; - case 'r': - functionTypeOptionCase(Retrain); - getIterationsArgument(); - break; - case 's': - functionTypeOptionCase(Supervised); - getIterationsArgument(); - break; - case 't': - functionTypeOptionCase(Train); - getIterationsArgument(); - break; - case 'h': - help(); - return; - default: - throw err_Exception(); - } - } - - if (!TheFunctionType) { - help(); - return; - } - - nonoptarg = argc - optind; - - switch (*TheFunctionType) { - case Tagger: - if (!TheFunctionTypeType) { - HMM HiddenMarkovModelTagger_; - g_FILE_Tagger(HiddenMarkovModelTagger_); - break; - } - - switch (*TheFunctionTypeType) { - case Unigram: { - switch (*TheUnigramType) { - case Stream_5_3_1: { - Stream_5_3_1_Tagger Stream_5_3_1_Tagger_(TheFlags); - g_StreamTagger(Stream_5_3_1_Tagger_); - } break; - case Stream_5_3_2: { - Stream_5_3_2_Tagger Stream_5_3_2_Tagger_(TheFlags); - g_StreamTagger(Stream_5_3_2_Tagger_); - } break; - case Stream_5_3_3: { - Stream_5_3_3_Tagger Stream_5_3_3_Tagger_(TheFlags); - g_StreamTagger(Stream_5_3_3_Tagger_); - } break; - default: - std::abort(); - } - } break; - case SlidingWindow: { - LSWPoST SlidingWindowTagger_; - g_FILE_Tagger(SlidingWindowTagger_); - } break; - case Perceptron: { - PerceptronTagger perceptron(TheFlags); - g_StreamTagger(perceptron); - } break; - default: - std::abort(); - } - - break; - case Retrain: - if (!TheFunctionTypeType) { - HMM HiddenMarkovModelTagger_; - r_FILE_Tagger(HiddenMarkovModelTagger_); - break; - } - - switch (*TheFunctionTypeType) { - case Unigram: { - std::stringstream what_; - what_ << "invalid option -- 'u'"; - throw Exception::apertium_tagger::InvalidOption(what_); - } - case SlidingWindow: { - LSWPoST SlidingWindowTagger_; - r_FILE_Tagger(SlidingWindowTagger_); - } break; - default: - std::abort(); - } - - break; - case Supervised: - if (!TheFunctionTypeType) { - HMM HiddenMarkovModelTagger_; - s_FILE_Tagger(HiddenMarkovModelTagger_); - break; - } - - switch (*TheFunctionTypeType) { - case Unigram: { - switch (*TheUnigramType) { - case Stream_5_3_1: { - Stream_5_3_1_TaggerTrainer Stream_5_3_1_TaggerTrainer_(TheFlags); - s_StreamTaggerTrainer(Stream_5_3_1_TaggerTrainer_); - } break; - case Stream_5_3_2: { - Stream_5_3_2_TaggerTrainer Stream_5_3_2_TaggerTrainer_(TheFlags); - s_StreamTaggerTrainer(Stream_5_3_2_TaggerTrainer_); - } break; - case Stream_5_3_3: { - Stream_5_3_3_TaggerTrainer Stream_5_3_3_TaggerTrainer_(TheFlags); - s_StreamTaggerTrainer(Stream_5_3_3_TaggerTrainer_); - } break; - default: - std::abort(); - } - } break; - case SlidingWindow: { - std::stringstream what_; - what_ << "invalid option -- 'w'"; - throw Exception::apertium_tagger::InvalidOption(what_); - } break; - case Perceptron: { - PerceptronTagger perceptron(TheFlags); - s_StreamTaggerTrainer(perceptron); - } break; - default: - std::abort(); - } - - break; - case Train: - if (!TheFunctionTypeType) { - HMM HiddenMarkovModelTagger_; - t_FILE_Tagger(HiddenMarkovModelTagger_); - break; - } - - switch (*TheFunctionTypeType) { - case Unigram: { - std::stringstream what_; - what_ << "invalid option -- 'u'"; - throw Exception::apertium_tagger::InvalidOption(what_); - } - case SlidingWindow: { - LSWPoST SlidingWindowTagger_; - t_FILE_Tagger(SlidingWindowTagger_); - } break; - default: - std::abort(); - } - - break; - default: - std::abort(); - } - } catch (const basic_ExceptionType &basic_ExceptionType_) { - std::wcerr << "apertium-tagger: " << basic_ExceptionType_.what() << std::endl; - throw err_Exception(); - } -} - -void apertium_tagger::help() { - - std::wcerr << -"Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" -" [INPUT \\\n" -" [OUTPUT]]\n" -"\n" -" or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" -" CORPUS \\\n" -" SERIALISED_TAGGER\n" -"\n" -" or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" -" DICTIONARY \\\n" -" CORPUS \\\n" -" TAGGER_SPECIFICATION \\\n" -" SERIALISED_TAGGER \\\n" -" TAGGED_CORPUS \\\n" -" UNTAGGED_CORPUS\n" -"\n" -" or: apertium-tagger [OPTION]... -s 0 \\\n" -" DICTIONARY \\\n" -" TAGGER_SPECIFICATION \\\n" -" SERIALISED_TAGGER \\\n" -" TAGGED_CORPUS \\\n" -" UNTAGGED_CORPUS\n" -"\n" -" or: apertium-tagger [OPTION]... -s 0 \\\n" -" -u MODEL \\\n" -" SERIALISED_TAGGER \\\n" -" TAGGED_CORPUS\n" -"\n" -" or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" -" DICTIONARY \\\n" -" CORPUS \\\n" -" TAGGER_SPECIFICATION \\\n" -" SERIALISED_TAGGER\n" -"\n" -"Mandatory arguments to long options are mandatory for short options too.\n" -"\n"; - - std::vector > options_description_; - options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); - options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); - options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); - options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); - options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); - align::align_(options_description_); - std::wcerr << '\n'; - options_description_.clear(); - options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); - align::align_(options_description_); - std::wcerr << '\n'; - options_description_.clear(); - options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); - options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); - options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); - align::align_(options_description_); - std::wcerr << '\n'; - options_description_.clear(); - options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); - align::align_(options_description_); - std::wcerr << '\n'; - options_description_.clear(); - options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); - options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); - options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); - align::align_(options_description_); - std::wcerr << '\n'; - options_description_.clear(); - options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); - align::align_(options_description_); -} - -const struct option apertium_tagger::longopts[] = { - {"help", no_argument, 0, 'h'}, - {"sent-seg", no_argument, 0, 'b'}, - {"debug", no_argument, 0, 'd'}, - {"skip-on-error", no_argument, 0, 'e'}, - {"first", no_argument, 0, 'f'}, - {"mark", no_argument, 0, 'm'}, - {"show-superficial", no_argument, 0, 'p'}, - {"null-flush", no_argument, 0, 'z'}, - {"unigram", required_argument, 0, 'u'}, - {"sliding-window", no_argument, 0, 'w'}, - {"perceptron", no_argument, 0, 'x'}, - {"tagger", no_argument, 0, 'g'}, - {"retrain", required_argument, 0, 'r'}, - {"supervised", required_argument, 0, 's'}, - {"train", required_argument, 0, 't'}, - {0, 0, 0, 0}}; - -/** Utilities */ - -std::string apertium_tagger::option_string(const int &indexptr_) { - return option_string(longopts[indexptr_]); -} - -std::string apertium_tagger::option_string(const struct option &option_) { - std::stringstream option_string_; - option_string_ << "--" << option_.name; - return option_string_.str(); -} - -void apertium_tagger::locale_global_() { - -#if defined __clang__ - - std::locale::global(std::locale("")); - -#else -#if defined __APPLE__ - - LtLocale::tryToSetLocale(); - -#else - - std::locale::global(std::locale("")); - -#endif // defined __APPLE__ -#endif // defined __clang__ -} - -void apertium_tagger::set_indexptr() { - if (The_val == longopts[The_indexptr].val) - return; - - for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; - ++longopts_Index) { - if (The_val == longopts[longopts_Index].val) { - The_indexptr = longopts_Index; - return; - } - } -} - -void apertium_tagger::flagOptionCase( - bool (basic_Tagger::Flags::*GetFlag)() const, - void (basic_Tagger::Flags::*SetFlag)(const bool &)) { - if ((TheFlags.*GetFlag)()) { - std::stringstream what_; - what_ << "unexpected '" << option_string() << "' following '" - << option_string() << '\''; - throw Exception::apertium_tagger::UnexpectedFlagOption(what_); - } - - (TheFlags.*SetFlag)(true); -} - -std::string apertium_tagger::option_string() { - return option_string(The_indexptr); -} - -void apertium_tagger::functionTypeTypeOptionCase( - const FunctionTypeType &FunctionTypeType_) { - if (FunctionTypeTypeOption_indexptr) { - std::stringstream what_; - what_ << "unexpected '" << option_string() << "' following '" - << option_string(*FunctionTypeTypeOption_indexptr) - << '\''; - throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); - } - - TheFunctionTypeType = FunctionTypeType_; - FunctionTypeTypeOption_indexptr = The_indexptr; -} - -void apertium_tagger::functionTypeOptionCase( - const FunctionType &FunctionType_) { - if (FunctionTypeOption_indexptr) { - std::stringstream what_; - what_ << "unexpected '" << option_string() << "' following '" - << option_string(*FunctionTypeOption_indexptr) - << '\''; - throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); - } - - TheFunctionType = FunctionType_; - FunctionTypeOption_indexptr = The_indexptr; -} - -void apertium_tagger::getIterationsArgument() { - try { - TheFunctionTypeOptionArgument = optarg_unsigned_long("ITERATIONS"); - } catch (const ExceptionType &ExceptionType_) { - std::stringstream what_; - what_ << "invalid argument '" << optarg << "' for '" << option_string() - << '\''; - throw Exception::apertium_tagger::InvalidArgument(what_); - } -} - -static unsigned long parse_unsigned_long(const char *metavar, const char *val) { - char *str_end; - errno = 0; - unsigned long N_0 = std::strtoul(val, &str_end, 10); - - if (*str_end != '\0') { - std::stringstream what_; - what_ << "can't convert " << metavar << " \"" << val << "\" to unsigned long"; - throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); - } - - if (*val == '\0') { - std::stringstream what_; - what_ << "can't convert " << metavar << " of size 1 \"\" to unsigned long"; - throw Exception::apertium_tagger::optarg_eq_NULL(what_); - } - - if (errno == ERANGE) { - std::stringstream what_; - what_ << "can't convert " << metavar << " \"" << val - << "\" to unsigned long, not in unsigned long range"; - throw Exception::apertium_tagger::ERANGE_(what_); - } - - return N_0; -} - -unsigned long apertium_tagger::optarg_unsigned_long(const char *metavar) { - return parse_unsigned_long(metavar, optarg); -} - -void apertium_tagger::get_file_arguments( - bool get_crp_fn, - char **DicFn, char **CrpFn, - char **TaggedFn, char **UntaggedFn, - char **TsxFn, char **ProbFn) { - if (*TheFunctionType != Retrain) { - *DicFn = argv[optind++]; - } - if (get_crp_fn) { - *CrpFn = argv[optind++]; - } - if (*TheFunctionType == Supervised) { - *TsxFn = argv[optind++]; - *ProbFn = argv[optind++]; - *TaggedFn = argv[optind++]; - } - *UntaggedFn = argv[optind++]; - if (*TheFunctionType == Supervised && !get_crp_fn) { - *CrpFn = *UntaggedFn; - } - if (*TheFunctionType != Supervised) { - if (*TheFunctionType != Retrain) { - *TsxFn = argv[optind++]; - } - *ProbFn = argv[optind++]; - } -} - -void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const &TsxFn) { - FILE_Tagger_.deserialise(TsxFn); - FILE_Tagger_.set_debug(TheFlags.getDebug()); - TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); -} - -MorphoStream* apertium_tagger::setup_untagged_morpho_stream( - FILE_Tagger &FILE_Tagger_, - char *DicFn, char *UntaggedFn, - FILE **Dictionary, FILE **UntaggedCorpus) { - if (*TheFunctionType != Retrain) { - *Dictionary = try_open_file_utf8("DICTIONARY", DicFn, "r"); - } - *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); - - FILE_Tagger_.read_dictionary(*Dictionary); - - return new FileMorphoStream(*UntaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); -} - -void apertium_tagger::close_untagged_files( - char *DicFn, char *UntaggedFn, - FILE *Dictionary, FILE *UntaggedCorpus) { - if (*TheFunctionType == Supervised || *TheFunctionType == Train) { - try_close_file("DICTIONARY", DicFn, Dictionary); - } - try_close_file("UNTAGGED_CORPUS", UntaggedFn, UntaggedCorpus); -} - -/** Implementation of flags/subcommands */ - -void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { - locale_global_(); - - expect_file_arguments(nonoptarg, 1, 4); - - std::ifstream SerialisedAnalysisFrequencies; - try_open_fstream("SERIALISED_TAGGER", argv[optind], - SerialisedAnalysisFrequencies); - - try { - StreamTagger_.deserialise(SerialisedAnalysisFrequencies); - } catch (const basic_ExceptionType &basic_ExceptionType_) { - std::stringstream what_; - what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] - << "\" Reason: " << basic_ExceptionType_.what(); - throw Exception::apertium_tagger::deserialise(what_); - } - - if (nonoptarg < 2) { - Stream Input(TheFlags); - StreamTagger_.tag(Input, std::wcout); - return; - } - - std::wifstream Input_stream; - try_open_fstream("INPUT", argv[optind + 1], Input_stream); - - if (nonoptarg < 3) { - Stream Input(TheFlags, Input_stream, argv[optind + 1]); - StreamTagger_.tag(Input, std::wcout); - return; - } - - std::wofstream Output_stream; - try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); - - Stream Input(TheFlags, Input_stream, argv[optind + 1]); - StreamTagger_.tag(Input, Output_stream); -} - -void apertium_tagger::s_StreamTaggerTrainer( - StreamTaggerTrainer &StreamTaggerTrainer_) { - locale_global_(); - - if (TheFunctionTypeOptionArgument != 0 && *TheFunctionTypeType != Perceptron) { - std::stringstream what_; - what_ << "invalid argument '" << TheFunctionTypeOptionArgument - << "' for '--supervised'"; - throw Exception::apertium_tagger::InvalidArgument(what_); - } - - if (*TheFunctionTypeType == Perceptron) { - expect_file_arguments(nonoptarg, 4); - } else { - expect_file_arguments(nonoptarg, 2); - } - - std::wifstream TaggedCorpus_stream; - try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); - Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); - - if (*TheFunctionTypeType == Perceptron) { - std::wifstream UntaggedCorpus_stream; - try_open_fstream("UNTAGGED_CORPUS", argv[optind + 2], UntaggedCorpus_stream); - Stream UntaggedCorpus(TheFlags, UntaggedCorpus_stream, argv[optind + 2]); - - PerceptronTagger &pt = dynamic_cast(StreamTaggerTrainer_); - pt.read_spec(argv[optind + 3]); - pt.train(TaggedCorpus, UntaggedCorpus, TheFunctionTypeOptionArgument); - } else { - StreamTaggerTrainer_.train(TaggedCorpus); - } - - std::ofstream Serialised_basic_Tagger; - try_open_fstream("SERIALISED_TAGGER", argv[optind], - Serialised_basic_Tagger); - - StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); -} - -void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { - LtLocale::tryToSetLocale(); - - expect_file_arguments(nonoptarg, 1, 4); - - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); - FILE_Tagger_.deserialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); - - FILE_Tagger_.set_debug(TheFlags.getDebug()); - TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - TaggerWord::generate_marks = TheFlags.getMark(); - FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial()); - FILE_Tagger_.setNullFlush(TheFlags.getNullFlush()); - - if (nonoptarg < 2) - FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst()); - else { - FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); - - if (nonoptarg < 3) - FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst()); - else { - FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); - FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst()); - try_close_file("OUTPUT", argv[optind + 2], Output); - } - - try_close_file("INPUT", argv[optind + 1], Input); - } -} - -void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { - LtLocale::tryToSetLocale(); - - expect_file_arguments(nonoptarg, 2); - - char *ProbFn, *UntaggedFn; - - get_file_arguments( - false, - NULL, NULL, NULL, &UntaggedFn, - NULL, &ProbFn); - - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", ProbFn, "rb"); - FILE_Tagger_.deserialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); - - FILE_Tagger_.set_debug(TheFlags.getDebug()); - TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); - - FILE *UntaggedCorpus; - MorphoStream* ms = setup_untagged_morpho_stream( - FILE_Tagger_, - NULL, UntaggedFn, - NULL, &UntaggedCorpus); - - FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); - delete ms; - close_untagged_files( - NULL, UntaggedFn, - NULL, UntaggedCorpus); - - Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); - FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); -} - -void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { - LtLocale::tryToSetLocale(); - - if (TheFunctionTypeOptionArgument == 0) { - expect_file_arguments(nonoptarg, 5, 7); - } else { - expect_file_arguments(nonoptarg, 6); - } - char *DicFn, *CrpFn, *TsxFn, *ProbFn, *TaggedFn, *UntaggedFn; - bool do_unsup = nonoptarg == 6; - - get_file_arguments( - do_unsup, - &DicFn, &CrpFn, &TaggedFn, &UntaggedFn, - &TsxFn, &ProbFn); - init_FILE_Tagger(FILE_Tagger_, TsxFn); - - FILE *Dictionary, *UntaggedCorpus; - MorphoStream* ms = setup_untagged_morpho_stream( - FILE_Tagger_, - DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); - FILE *TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); - FileMorphoStream tms(TaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); - - FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); - try_close_file("TAGGED_CORPUS", TaggedFn, TaggedCorpus); - delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); - - if (do_unsup) { - FILE *Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); - FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); - try_close_file("CORPUS", CrpFn, Corpus); - } - - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); - FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); -} - -void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { - LtLocale::tryToSetLocale(); - - expect_file_arguments(nonoptarg, 4); - - char *DicFn, *TsxFn, *ProbFn, *UntaggedFn; - UntaggedFn = NULL; - - get_file_arguments( - false, - &DicFn, NULL, NULL, &UntaggedFn, - &TsxFn, &ProbFn); - init_FILE_Tagger(FILE_Tagger_, TsxFn); - - FILE *Dictionary, *UntaggedCorpus; - MorphoStream* ms = setup_untagged_morpho_stream( - FILE_Tagger_, - DicFn, UntaggedFn, - &Dictionary, &UntaggedCorpus); - - FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); - delete ms; - close_untagged_files( - DicFn, UntaggedFn, - Dictionary, UntaggedCorpus); - - FILE *Serialised_FILE_Tagger = - try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); - FILE_Tagger_.serialise(Serialised_FILE_Tagger); - try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); - -} -} - int main(int argc, char **argv) { try { - apertium_tagger(argc, argv); - } catch (const err_Exception &err_Exception_) { + Apertium::apertium_tagger(argc, argv); + } catch (const Apertium::err_Exception &err_Exception_) { std::wcerr << "Try 'apertium-tagger --help' for more information." << std::endl; return 1; } catch (...) { diff --git a/apertium/basic_stream_tagger.cc b/apertium/basic_stream_tagger.cc index 1d3d947..5055e4a 100644 --- a/apertium/basic_stream_tagger.cc +++ b/apertium/basic_stream_tagger.cc @@ -35,7 +35,7 @@ namespace Apertium { basic_StreamTagger::~basic_StreamTagger() {} -void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) const { +void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) { while (true) { StreamedType StreamedType_ = Input.get(); Output << StreamedType_.TheString; @@ -62,7 +62,7 @@ void basic_StreamTagger::tag(Stream &Input, std::wostream &Output) const { } void basic_StreamTagger::tag(const LexicalUnit &LexicalUnit_, - std::wostream &Output) const { + std::wostream &Output) { #if ENABLE_DEBUG for (std::vector::const_iterator Analysis_ = diff --git a/apertium/basic_stream_tagger.h b/apertium/basic_stream_tagger.h index 898dea4..fb105f9 100644 --- a/apertium/basic_stream_tagger.h +++ b/apertium/basic_stream_tagger.h @@ -37,7 +37,7 @@ class basic_StreamTagger : public StreamTagger { public: virtual ~basic_StreamTagger(); virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; - virtual void tag(Stream &Input, std::wostream &Output) const; + virtual void tag(Stream &Input, std::wostream &Output); protected: virtual long double score(const Analysis &Analysis_) const = 0; @@ -48,7 +48,7 @@ protected: #endif // ENABLE_DEBUG private: - void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output) const; + void tag(const LexicalUnit &LexicalUnit_, std::wostream &Output); }; } diff --git a/apertium/basic_tagger.cc b/apertium/basic_tagger.cc index 980bd98..4af7619 100644 --- a/apertium/basic_tagger.cc +++ b/apertium/basic_tagger.cc @@ -20,33 +20,33 @@ basic_Tagger::Flags::Flags() : Debug(false), First(false), Mark(false), ShowSuperficial(false), NullFlush(false) {} -bool basic_Tagger::Flags::getDebug() const { return Debug; } +bool basic_Tagger::Flags::getDebug() { return Debug; } void basic_Tagger::Flags::setDebug(const bool &Debug_) { Debug = Debug_; } -bool basic_Tagger::Flags::getSentSeg() const { return SentSeg; } +bool basic_Tagger::Flags::getSentSeg() { return SentSeg; } void basic_Tagger::Flags::setSentSeg(const bool &SentSeg_) { SentSeg = SentSeg_; } -bool basic_Tagger::Flags::getSkipErrors() const { return SkipErrors; } +bool basic_Tagger::Flags::getSkipErrors() { return SkipErrors; } void basic_Tagger::Flags::setSkipErrors(const bool &SkipErrors_) { SkipErrors = SkipErrors_; } -bool basic_Tagger::Flags::getFirst() const { return First; } +bool basic_Tagger::Flags::getFirst() { return First; } void basic_Tagger::Flags::setFirst(const bool &First_) { First = First_; } -bool basic_Tagger::Flags::getMark() const { return Mark; } +bool basic_Tagger::Flags::getMark() { return Mark; } void basic_Tagger::Flags::setMark(const bool &Mark_) { Mark = Mark_; } -bool basic_Tagger::Flags::getShowSuperficial() const { return ShowSuperficial; } +bool basic_Tagger::Flags::getShowSuperficial() { return ShowSuperficial; } void basic_Tagger::Flags::setShowSuperficial(const bool &ShowSuperficial_) { ShowSuperficial = ShowSuperficial_; } -bool basic_Tagger::Flags::getNullFlush() const { return NullFlush; } +bool basic_Tagger::Flags::getNullFlush() { return NullFlush; } void basic_Tagger::Flags::setNullFlush(const bool &NullFlush_) { NullFlush = NullFlush_; @@ -54,5 +54,5 @@ void basic_Tagger::Flags::setNullFlush(const bool &NullFlush_) { basic_Tagger::basic_Tagger() : TheFlags() {} -basic_Tagger::basic_Tagger(const Flags &Flags_) : TheFlags(Flags_) {} +basic_Tagger::basic_Tagger(Flags &Flags_) : TheFlags(Flags_) {} } diff --git a/apertium/basic_tagger.h b/apertium/basic_tagger.h index c2f3f62..14a796b 100644 --- a/apertium/basic_tagger.h +++ b/apertium/basic_tagger.h @@ -22,19 +22,19 @@ public: class Flags { public: Flags(); - bool getDebug() const; + bool getDebug(); void setDebug(const bool &Debug_); - bool getSentSeg() const; - void setSentSeg(const bool &SentSeg); - bool getSkipErrors() const; + bool getSentSeg(); + void setSentSeg(const bool &SentSeg_); + bool getSkipErrors(); void setSkipErrors(const bool &SkipErrors_); - bool getFirst() const; + bool getFirst(); void setFirst(const bool &First_); - bool getMark() const; + bool getMark(); void setMark(const bool &Mark_); - bool getShowSuperficial() const; + bool getShowSuperficial(); void setShowSuperficial(const bool &ShowSuperficial_); - bool getNullFlush() const; + bool getNullFlush(); void setNullFlush(const bool &NullFlush_); static bool (Flags::*GetDebug)() const; static void (Flags::*SetDebug)(const bool &); @@ -59,7 +59,7 @@ public: protected: basic_Tagger(); - basic_Tagger(const Flags &Flags_); + basic_Tagger(Flags &Flags_); Flags TheFlags; }; } diff --git a/apertium/perceptron_tagger.cc b/apertium/perceptron_tagger.cc index 066d0f7..395fc0e 100644 --- a/apertium/perceptron_tagger.cc +++ b/apertium/perceptron_tagger.cc @@ -12,7 +12,7 @@ PerceptronTagger::PerceptronTagger(basic_Tagger::Flags flags) : basic_Tagger(fla PerceptronTagger::~PerceptronTagger() {}; -void PerceptronTagger::tag(Stream &in, std::wostream &out) const { +void PerceptronTagger::tag(Stream &in, std::wostream &out) { SentenceStream::SentenceTagger::tag(in, out, TheFlags.getSentSeg()); } @@ -30,7 +30,7 @@ operator<<(std::wostream &out, PerceptronTagger const &pt) { } TaggedSentence -PerceptronTagger::tagSentence(const Sentence &untagged_sent) const { +PerceptronTagger::tagSentence(const Sentence &untagged_sent) { const size_t sent_len = untagged_sent.size(); std::vector agenda; @@ -100,7 +100,7 @@ PerceptronTagger::tagSentence(const Sentence &untagged_sent) const { void PerceptronTagger::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) const { + std::wostream &output) { StreamTagger::outputLexicalUnit(lexical_unit, analysis, output); } diff --git a/apertium/perceptron_tagger.h b/apertium/perceptron_tagger.h index 344663c..919af2e 100644 --- a/apertium/perceptron_tagger.h +++ b/apertium/perceptron_tagger.h @@ -22,16 +22,16 @@ public: virtual void train(Stream &tagged, Stream &untagged, int iterations); // tagger virtual void deserialise(std::istream &serialised); - virtual void tag(Stream &input, std::wostream &output) const; + virtual void tag(Stream &input, std::wostream &output); void read_spec(const std::string &filename); friend std::wostream& operator<<(std::wostream &out, PerceptronTagger const &pt); protected: - virtual TaggedSentence tagSentence(const Sentence &untagged) const; + virtual TaggedSentence tagSentence(const Sentence &untagged); virtual void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) const; + std::wostream &output); private: bool trainSentence( const TrainingSentence &sentence, diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc index 7ff1120..d684767 100644 --- a/apertium/pretransfer.cc +++ b/apertium/pretransfer.cc @@ -1,5 +1,13 @@ #include +#include +#include +#include +#include + +#include +#include + void readAndWriteUntil(FILE *input, FILE *output, int const charcode) { int mychar; diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h index 09ecd86..4c0ce57 100644 --- a/apertium/pretransfer.h +++ b/apertium/pretransfer.h @@ -1,13 +1,23 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + #ifndef PRETRANSFER_H #define PRETRANSFER_H #include -#include - #include -#include -#include -#include void readAndWriteUntil(FILE *input, FILE *output, int const charcode); void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep); diff --git a/apertium/sentence_stream.cc b/apertium/sentence_stream.cc index cfcb7a4..39630ff 100644 --- a/apertium/sentence_stream.cc +++ b/apertium/sentence_stream.cc @@ -38,7 +38,7 @@ bool isSentenceEnd(StreamedType tok, Stream &in, bool sent_seg) { SentenceTagger::SentenceTagger() {} -void SentenceTagger::tag(Stream &in, std::wostream &out, bool sent_seg) const { +void SentenceTagger::tag(Stream &in, std::wostream &out, bool sent_seg) { clearBuffers(); while (true) { @@ -68,7 +68,7 @@ void SentenceTagger::clearBuffers() const { flushes.clear(); } -void SentenceTagger::tagAndPutSentence(std::wostream &out) const { +void SentenceTagger::tagAndPutSentence(std::wostream &out) { TaggedSentence tagged_sent = tagSentence(lexical_sent); TaggedSentence::const_iterator ts_it = tagged_sent.begin(); diff --git a/apertium/sentence_stream.h b/apertium/sentence_stream.h index c317c67..aea298e 100644 --- a/apertium/sentence_stream.h +++ b/apertium/sentence_stream.h @@ -20,16 +20,16 @@ namespace SentenceStream { bool isSentenceEnd(Stream &in, bool sent_seg = false); class SentenceTagger { public: - void tag(Stream &in, std::wostream &out, bool sent_seg) const; + void tag(Stream &in, std::wostream &out, bool sent_seg); SentenceTagger(); protected: - virtual TaggedSentence tagSentence(const Sentence &untagged) const = 0; + virtual TaggedSentence tagSentence(const Sentence &untagged) = 0; virtual void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) const = 0; + std::wostream &output) = 0; private: void clearBuffers() const; - void tagAndPutSentence(std::wostream &out) const; + void tagAndPutSentence(std::wostream &out); void putTaggedSent( std::wostream &out, TaggedSentence &tagged_sent, Sentence &full_sent, std::vector &flushes) const; diff --git a/apertium/stream.cc b/apertium/stream.cc index 3426513..ab14f23 100644 --- a/apertium/stream.cc +++ b/apertium/stream.cc @@ -30,23 +30,23 @@ #include namespace Apertium { -Stream::Stream(const basic_Tagger::Flags &Flags_) +Stream::Stream(basic_Tagger::Flags &Flags_) : TheCharacterStream(std::wcin), TheFilename(), TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} -Stream::Stream(const basic_Tagger::Flags &Flags_, +Stream::Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const char *const Filename_) : TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} -Stream::Stream(const basic_Tagger::Flags &Flags_, +Stream::Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const std::string &Filename_) : TheCharacterStream(CharacterStream_), TheFilename(Filename_), TheLineNumber(1), TheLine(), TheFlags(Flags_), private_flush_(false), ThePreviousCase() {} -Stream::Stream(const basic_Tagger::Flags &Flags_, +Stream::Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const std::stringstream &Filename_) : TheCharacterStream(CharacterStream_), TheFilename(Filename_.str()), @@ -652,7 +652,7 @@ bool Stream::flush_() const { return private_flush_; } void Stream::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, const basic_Tagger::Flags &flags) { + std::wostream &output, basic_Tagger::Flags &flags) { using namespace std::rel_ops; output << L"^"; diff --git a/apertium/stream.h b/apertium/stream.h index bb3b104..18aa83a 100644 --- a/apertium/stream.h +++ b/apertium/stream.h @@ -28,12 +28,12 @@ namespace Apertium { class Stream { public: - Stream(const basic_Tagger::Flags &Flags_); - Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + Stream(basic_Tagger::Flags &Flags_); + Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const char *const Filename_); - Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const std::string &Filename_); - Stream(const basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, + Stream(basic_Tagger::Flags &Flags_, std::wifstream &CharacterStream_, const std::stringstream &Filename_); StreamedType get(); StreamedType peek(); @@ -42,7 +42,7 @@ public: static void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output, const basic_Tagger::Flags &flags); + std::wostream &output, basic_Tagger::Flags &flags); std::size_t TheLineNumber; private: @@ -66,7 +66,7 @@ private: std::wistream &TheCharacterStream; Optional TheFilename; std::wstring TheLine; - const basic_Tagger::Flags &TheFlags; + basic_Tagger::Flags &TheFlags; bool private_flush_ : 1; Optional ThePreviousCase; }; diff --git a/apertium/stream_5_3_1_tagger.cc b/apertium/stream_5_3_1_tagger.cc index 7268800..34c2809 100644 --- a/apertium/stream_5_3_1_tagger.cc +++ b/apertium/stream_5_3_1_tagger.cc @@ -36,7 +36,7 @@ #endif // ENABLE_DEBUG namespace Apertium { -Stream_5_3_1_Tagger::Stream_5_3_1_Tagger(const Flags &Flags_) +Stream_5_3_1_Tagger::Stream_5_3_1_Tagger(Flags &Flags_) : basic_Tagger(Flags_), basic_5_3_1_Tagger() {} void Stream_5_3_1_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { diff --git a/apertium/stream_5_3_1_tagger.h b/apertium/stream_5_3_1_tagger.h index aab04c8..66563b7 100644 --- a/apertium/stream_5_3_1_tagger.h +++ b/apertium/stream_5_3_1_tagger.h @@ -34,7 +34,7 @@ namespace Apertium { class Stream_5_3_1_Tagger : private basic_5_3_1_Tagger, public basic_StreamTagger { public: - Stream_5_3_1_Tagger(const Flags &Flags_); + Stream_5_3_1_Tagger(Flags &Flags_); void deserialise(std::istream &Serialised_basic_Tagger); private: diff --git a/apertium/stream_5_3_1_tagger_trainer.cc b/apertium/stream_5_3_1_tagger_trainer.cc index 615b22f..d14b4ba 100644 --- a/apertium/stream_5_3_1_tagger_trainer.cc +++ b/apertium/stream_5_3_1_tagger_trainer.cc @@ -26,7 +26,7 @@ namespace Apertium { Stream_5_3_1_TaggerTrainer::Stream_5_3_1_TaggerTrainer( - const basic_Tagger::Flags &Flags_) + basic_Tagger::Flags &Flags_) : basic_Tagger(Flags_), basic_5_3_1_Tagger() {} void Stream_5_3_1_TaggerTrainer::serialise( diff --git a/apertium/stream_5_3_1_tagger_trainer.h b/apertium/stream_5_3_1_tagger_trainer.h index 486f7be..a3d00fc 100644 --- a/apertium/stream_5_3_1_tagger_trainer.h +++ b/apertium/stream_5_3_1_tagger_trainer.h @@ -28,7 +28,7 @@ namespace Apertium { class Stream_5_3_1_TaggerTrainer : private basic_5_3_1_Tagger, public basic_StreamTaggerTrainer { public: - Stream_5_3_1_TaggerTrainer(const Flags &Flags_); + Stream_5_3_1_TaggerTrainer(Flags &Flags_); void serialise(std::ostream &Serialised_basic_Tagger) const; private: diff --git a/apertium/stream_5_3_2_tagger.cc b/apertium/stream_5_3_2_tagger.cc index 1060711..8ec5293 100644 --- a/apertium/stream_5_3_2_tagger.cc +++ b/apertium/stream_5_3_2_tagger.cc @@ -34,7 +34,7 @@ #endif // ENABLE_DEBUG namespace Apertium { -Stream_5_3_2_Tagger::Stream_5_3_2_Tagger(const Flags &Flags_) +Stream_5_3_2_Tagger::Stream_5_3_2_Tagger(Flags &Flags_) : basic_Tagger(Flags_), basic_5_3_2_Tagger() {} void Stream_5_3_2_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { diff --git a/apertium/stream_5_3_2_tagger.h b/apertium/stream_5_3_2_tagger.h index 397d729..4b7c992 100644 --- a/apertium/stream_5_3_2_tagger.h +++ b/apertium/stream_5_3_2_tagger.h @@ -34,7 +34,7 @@ namespace Apertium { class Stream_5_3_2_Tagger : private basic_5_3_2_Tagger, public basic_StreamTagger { public: - Stream_5_3_2_Tagger(const Flags &Flags_); + Stream_5_3_2_Tagger(Flags &Flags_); void deserialise(std::istream &Serialised_basic_Tagger); private: diff --git a/apertium/stream_5_3_2_tagger_trainer.cc b/apertium/stream_5_3_2_tagger_trainer.cc index f7f54d3..3051ed7 100644 --- a/apertium/stream_5_3_2_tagger_trainer.cc +++ b/apertium/stream_5_3_2_tagger_trainer.cc @@ -25,7 +25,7 @@ #include namespace Apertium { -Stream_5_3_2_TaggerTrainer::Stream_5_3_2_TaggerTrainer(const Flags &Flags_) +Stream_5_3_2_TaggerTrainer::Stream_5_3_2_TaggerTrainer(Flags &Flags_) : basic_Tagger(Flags_) {} void Stream_5_3_2_TaggerTrainer::serialise( diff --git a/apertium/stream_5_3_2_tagger_trainer.h b/apertium/stream_5_3_2_tagger_trainer.h index 04c2a07..39b4959 100644 --- a/apertium/stream_5_3_2_tagger_trainer.h +++ b/apertium/stream_5_3_2_tagger_trainer.h @@ -25,7 +25,7 @@ namespace Apertium { class Stream_5_3_2_TaggerTrainer : private basic_5_3_2_Tagger, public basic_StreamTaggerTrainer { public: - Stream_5_3_2_TaggerTrainer(const Flags &Flags_); + Stream_5_3_2_TaggerTrainer(Flags &Flags_); void serialise(std::ostream &Serialised_basic_Tagger) const; private: diff --git a/apertium/stream_5_3_3_tagger.cc b/apertium/stream_5_3_3_tagger.cc index 9eefc59..d28193f 100644 --- a/apertium/stream_5_3_3_tagger.cc +++ b/apertium/stream_5_3_3_tagger.cc @@ -33,7 +33,7 @@ #endif // ENABLE_DEBUG namespace Apertium { -Stream_5_3_3_Tagger::Stream_5_3_3_Tagger(const Flags &Flags_) +Stream_5_3_3_Tagger::Stream_5_3_3_Tagger(Flags &Flags_) : basic_Tagger(Flags_) {} void Stream_5_3_3_Tagger::deserialise(std::istream &Serialised_basic_Tagger) { diff --git a/apertium/stream_5_3_3_tagger.h b/apertium/stream_5_3_3_tagger.h index 9b7c5ca..a101fa8 100644 --- a/apertium/stream_5_3_3_tagger.h +++ b/apertium/stream_5_3_3_tagger.h @@ -36,7 +36,7 @@ namespace Apertium { class Stream_5_3_3_Tagger : private basic_5_3_3_Tagger, public basic_StreamTagger { public: - Stream_5_3_3_Tagger(const Flags &Flags_); + Stream_5_3_3_Tagger(Flags &Flags_); void deserialise(std::istream &Serialised_basic_Tagger); private: diff --git a/apertium/stream_5_3_3_tagger_trainer.cc b/apertium/stream_5_3_3_tagger_trainer.cc index bd6c47c..56f1e55 100644 --- a/apertium/stream_5_3_3_tagger_trainer.cc +++ b/apertium/stream_5_3_3_tagger_trainer.cc @@ -26,7 +26,7 @@ #include namespace Apertium { -Stream_5_3_3_TaggerTrainer::Stream_5_3_3_TaggerTrainer(const Flags &Flags_) +Stream_5_3_3_TaggerTrainer::Stream_5_3_3_TaggerTrainer(Flags &Flags_) : basic_Tagger(Flags_) {} void Stream_5_3_3_TaggerTrainer::serialise( diff --git a/apertium/stream_5_3_3_tagger_trainer.h b/apertium/stream_5_3_3_tagger_trainer.h index 7e2e5d6..ca6108c 100644 --- a/apertium/stream_5_3_3_tagger_trainer.h +++ b/apertium/stream_5_3_3_tagger_trainer.h @@ -26,7 +26,7 @@ namespace Apertium { class Stream_5_3_3_TaggerTrainer : private basic_5_3_3_Tagger, public basic_StreamTaggerTrainer { public: - Stream_5_3_3_TaggerTrainer(const Flags &Flags_); + Stream_5_3_3_TaggerTrainer(Flags &Flags_); void serialise(std::ostream &Serialised_basic_Tagger) const; private: diff --git a/apertium/stream_tagger.cc b/apertium/stream_tagger.cc index 7311399..dc6576f 100644 --- a/apertium/stream_tagger.cc +++ b/apertium/stream_tagger.cc @@ -7,7 +7,7 @@ StreamTagger::~StreamTagger() {} void StreamTagger::outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) const { + std::wostream &output) { Stream::outputLexicalUnit(lexical_unit, analysis, output, TheFlags); } } diff --git a/apertium/stream_tagger.h b/apertium/stream_tagger.h index 94b5a14..1b4e38f 100644 --- a/apertium/stream_tagger.h +++ b/apertium/stream_tagger.h @@ -11,10 +11,10 @@ class StreamTagger : protected virtual basic_Tagger { public: virtual ~StreamTagger(); virtual void deserialise(std::istream &Serialised_basic_Tagger) = 0; - virtual void tag(Stream &Input, std::wostream &Output) const = 0; + virtual void tag(Stream &Input, std::wostream &Output) = 0; void outputLexicalUnit( const LexicalUnit &lexical_unit, const Optional analysis, - std::wostream &output) const; + std::wostream &output); }; } diff --git a/apertium/tagger.cc b/apertium/tagger.cc new file mode 100644 index 0000000..2bd17b4 --- /dev/null +++ b/apertium/tagger.cc @@ -0,0 +1,810 @@ +// Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License as +// published by the Free Software Foundation; either version 2 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, see . + +#include + +#include "apertium_config.h" + +#include "align.h" +#include "basic_exception_type.h" +#include "basic_stream_tagger.h" +#include "basic_stream_tagger_trainer.h" +#include "basic_tagger.h" +#include "err_exception.h" +#include "exception.h" +#include "file_tagger.h" +#include "linebreak.h" +#include "stream_5_3_1_tagger.h" +#include "stream_5_3_1_tagger_trainer.h" +#include "stream_5_3_2_tagger.h" +#include "stream_5_3_2_tagger_trainer.h" +#include "stream_5_3_3_tagger.h" +#include "stream_5_3_3_tagger_trainer.h" +#include +#include +#include +#include +#include + +#include + +#include "getopt_long.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Apertium { +using namespace ShellUtils; +using namespace tagger_utils; + +/** Top level argument parsing */ + +apertium_tagger::apertium_tagger(int &argc, char **&argv) + : argc(argc), argv(argv), The_val(), nonoptarg(), + + The_indexptr(), FunctionTypeTypeOption_indexptr(), + FunctionTypeOption_indexptr(), + + TheFunctionTypeType(), TheUnigramType(), TheFunctionType(), + TheFunctionTypeOptionArgument(0), TheFlags() { + try { + /*Set optind so that multiple instances can be created */ + optind = 1; + while (true) { + The_val = getopt_long(argc, argv, "bdfegmpr:s:t:u:wxz", longopts, &The_indexptr); + + if (The_val == -1) + break; + + set_indexptr(); + + switch (The_val) { + case 'b': + flagOptionCase(&basic_Tagger::Flags::getSentSeg, + &basic_Tagger::Flags::setSentSeg); + break; + case 'd': + flagOptionCase(&basic_Tagger::Flags::getDebug, + &basic_Tagger::Flags::setDebug); + break; + case 'e': + flagOptionCase(&basic_Tagger::Flags::getSkipErrors, + &basic_Tagger::Flags::setSkipErrors); + break; + case 'f': + flagOptionCase(&basic_Tagger::Flags::getFirst, + &basic_Tagger::Flags::setFirst); + break; + case 'm': + flagOptionCase(&basic_Tagger::Flags::getMark, + &basic_Tagger::Flags::setMark); + break; + case 'p': + flagOptionCase(&basic_Tagger::Flags::getShowSuperficial, + &basic_Tagger::Flags::setShowSuperficial); + break; + case 'z': + flagOptionCase(&basic_Tagger::Flags::getNullFlush, + &basic_Tagger::Flags::setNullFlush); + break; + case 'u': + functionTypeTypeOptionCase(Unigram); + + if (std::strncmp(optarg, "1", sizeof "1" - 1) == 0) { + TheUnigramType = Stream_5_3_1; + break; + } + + if (std::strncmp(optarg, "2", sizeof "2" - 1) == 0) { + TheUnigramType = Stream_5_3_2; + break; + } + + if (std::strncmp(optarg, "3", sizeof "3" - 1) == 0) { + TheUnigramType = Stream_5_3_3; + break; + } + + { + std::stringstream what_; + what_ << "invalid argument '" << optarg << "' for '--unigram'\n" + "Valid arguments are:\n" + " - '1'\n" + " - '2'\n" + " - '3'"; + throw Exception::apertium_tagger::InvalidArgument(what_); + } + break; + case 'w': + functionTypeTypeOptionCase(SlidingWindow); + break; + case 'x': + functionTypeTypeOptionCase(Perceptron); + break; + case 'g': + functionTypeOptionCase(Tagger); + break; + case 'r': + functionTypeOptionCase(Retrain); + getIterationsArgument(); + break; + case 's': + functionTypeOptionCase(Supervised); + getIterationsArgument(); + break; + case 't': + functionTypeOptionCase(Train); + getIterationsArgument(); + break; + case 'h': + help(); + return; + default: + throw err_Exception(); + } + } + + if (!TheFunctionType) { + help(); + return; + } + + nonoptarg = argc - optind; + switch (*TheFunctionType) { + case Tagger: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + g_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + switch (*TheFunctionTypeType) { + case Unigram: { + switch (*TheUnigramType) { + case Stream_5_3_1: { + Stream_5_3_1_Tagger Stream_5_3_1_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_1_Tagger_); + } break; + case Stream_5_3_2: { + Stream_5_3_2_Tagger Stream_5_3_2_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_2_Tagger_); + } break; + case Stream_5_3_3: { + Stream_5_3_3_Tagger Stream_5_3_3_Tagger_(TheFlags); + g_StreamTagger(Stream_5_3_3_Tagger_); + } break; + default: + std::abort(); + } + } break; + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + g_FILE_Tagger(SlidingWindowTagger_); + } break; + case Perceptron: { + PerceptronTagger perceptron(TheFlags); + g_StreamTagger(perceptron); + } break; + default: + std::abort(); + } + + break; + case Retrain: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + r_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + std::stringstream what_; + what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + r_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + case Supervised: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + s_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + switch (*TheUnigramType) { + case Stream_5_3_1: { + Stream_5_3_1_TaggerTrainer Stream_5_3_1_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_1_TaggerTrainer_); + } break; + case Stream_5_3_2: { + Stream_5_3_2_TaggerTrainer Stream_5_3_2_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_2_TaggerTrainer_); + } break; + case Stream_5_3_3: { + Stream_5_3_3_TaggerTrainer Stream_5_3_3_TaggerTrainer_(TheFlags); + s_StreamTaggerTrainer(Stream_5_3_3_TaggerTrainer_); + } break; + default: + std::abort(); + } + } break; + case SlidingWindow: { + std::stringstream what_; + what_ << "invalid option -- 'w'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } break; + case Perceptron: { + PerceptronTagger perceptron(TheFlags); + s_StreamTaggerTrainer(perceptron); + } break; + default: + std::abort(); + } + + break; + case Train: + if (!TheFunctionTypeType) { + HMM HiddenMarkovModelTagger_; + t_FILE_Tagger(HiddenMarkovModelTagger_); + break; + } + + switch (*TheFunctionTypeType) { + case Unigram: { + std::stringstream what_; + what_ << "invalid option -- 'u'"; + throw Exception::apertium_tagger::InvalidOption(what_); + } + case SlidingWindow: { + LSWPoST SlidingWindowTagger_; + t_FILE_Tagger(SlidingWindowTagger_); + } break; + default: + std::abort(); + } + + break; + default: + std::abort(); + } + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::wcerr << "apertium-tagger: " << basic_ExceptionType_.what() << std::endl; + throw err_Exception(); + } +} + +void apertium_tagger::help() { + + std::wcerr << +"Usage: apertium-tagger [OPTION]... -g SERIALISED_TAGGER \\\n" +" [INPUT \\\n" +" [OUTPUT]]\n" +"\n" +" or: apertium-tagger [OPTION]... -r ITERATIONS \\\n" +" CORPUS \\\n" +" SERIALISED_TAGGER\n" +"\n" +" or: apertium-tagger [OPTION]... -s ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS \\\n" +" UNTAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -s 0 \\\n" +" DICTIONARY \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS \\\n" +" UNTAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -s 0 \\\n" +" -u MODEL \\\n" +" SERIALISED_TAGGER \\\n" +" TAGGED_CORPUS\n" +"\n" +" or: apertium-tagger [OPTION]... -t ITERATIONS \\\n" +" DICTIONARY \\\n" +" CORPUS \\\n" +" TAGGER_SPECIFICATION \\\n" +" SERIALISED_TAGGER\n" +"\n" +"Mandatory arguments to long options are mandatory for short options too.\n" +"\n"; + + std::vector > options_description_; + options_description_.push_back(std::make_pair("-d, --debug", "with -g, print error messages about the input")); + options_description_.push_back(std::make_pair("-f, --first", "with -g, reorder each lexical unit's analyses so that the chosen one is first")); + options_description_.push_back(std::make_pair("-m, --mark", "with -g, mark disambiguated lexical units")); + options_description_.push_back(std::make_pair("-p, --show-superficial", "with -g, output each lexical unit's surface form")); + options_description_.push_back(std::make_pair("-z, --null-flush", "with -g, flush the output after getting each null character")); + align::align_(options_description_); + std::wcerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-u, --unigram=MODEL", "use unigram algorithm MODEL from ")); + align::align_(options_description_); + std::wcerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-w, --sliding-window", "use the Light Sliding Window algorithm")); + options_description_.push_back(std::make_pair("-x, --perceptron", "use the averaged perceptron algorithm")); + options_description_.push_back(std::make_pair("-e, --skip-on-error", "with -xs, ignore certain types of errors with the training corpus")); + align::align_(options_description_); + std::wcerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-g, --tagger", "disambiguate the input")); + align::align_(options_description_); + std::wcerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-r, --retrain=ITERATIONS", "with -u: exit;\notherwise: retrain the tagger with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-s, --supervised=ITERATIONS", "with -u: train the tagger with a hand-tagged corpus;\nwith -w: exit;\notherwise: initialise the tagger with a hand-tagged corpus and retrain it with ITERATIONS unsupervised iterations")); + options_description_.push_back(std::make_pair("-t, --train=ITERATIONS", "with -u: exit;\notherwise: train the tagger with ITERATIONS unsupervised iterations")); + align::align_(options_description_); + std::wcerr << '\n'; + options_description_.clear(); + options_description_.push_back(std::make_pair("-h, --help", "display this help and exit")); + align::align_(options_description_); +} + +const struct option apertium_tagger::longopts[] = { + {"help", no_argument, 0, 'h'}, + {"sent-seg", no_argument, 0, 'b'}, + {"debug", no_argument, 0, 'd'}, + {"skip-on-error", no_argument, 0, 'e'}, + {"first", no_argument, 0, 'f'}, + {"mark", no_argument, 0, 'm'}, + {"show-superficial", no_argument, 0, 'p'}, + {"null-flush", no_argument, 0, 'z'}, + {"unigram", required_argument, 0, 'u'}, + {"sliding-window", no_argument, 0, 'w'}, + {"perceptron", no_argument, 0, 'x'}, + {"tagger", no_argument, 0, 'g'}, + {"retrain", required_argument, 0, 'r'}, + {"supervised", required_argument, 0, 's'}, + {"train", required_argument, 0, 't'}, + {0, 0, 0, 0}}; + +/** Utilities */ + +std::string apertium_tagger::option_string(const int &indexptr_) { + return option_string(longopts[indexptr_]); +} + +std::string apertium_tagger::option_string(const struct option &option_) { + std::stringstream option_string_; + option_string_ << "--" << option_.name; + return option_string_.str(); +} + +void apertium_tagger::locale_global_() { + +#if defined __clang__ + + std::locale::global(std::locale("")); + +#else +#if defined __APPLE__ + + LtLocale::tryToSetLocale(); + +#else + + std::locale::global(std::locale("")); + +#endif // defined __APPLE__ +#endif // defined __clang__ +} + +void apertium_tagger::set_indexptr() { + if (The_val == longopts[The_indexptr].val) + return; + + for (std::size_t longopts_Index = 0; longopts[longopts_Index].val != 0; + ++longopts_Index) { + if (The_val == longopts[longopts_Index].val) { + The_indexptr = longopts_Index; + return; + } + } +} + +void apertium_tagger::flagOptionCase( + bool (basic_Tagger::Flags::*GetFlag)(), + void (basic_Tagger::Flags::*SetFlag)(const bool &)) { + if ((TheFlags.*GetFlag)()) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string() << '\''; + throw Exception::apertium_tagger::UnexpectedFlagOption(what_); + } + + (TheFlags.*SetFlag)(true); +} + +std::string apertium_tagger::option_string() { + return option_string(The_indexptr); +} + +void apertium_tagger::functionTypeTypeOptionCase( + const FunctionTypeType &FunctionTypeType_) { + if (FunctionTypeTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeTypeOption(what_); + } + + TheFunctionTypeType = FunctionTypeType_; + FunctionTypeTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::functionTypeOptionCase( + const FunctionType &FunctionType_) { + if (FunctionTypeOption_indexptr) { + std::stringstream what_; + what_ << "unexpected '" << option_string() << "' following '" + << option_string(*FunctionTypeOption_indexptr) + << '\''; + throw Exception::apertium_tagger::UnexpectedFunctionTypeOption(what_); + } + + TheFunctionType = FunctionType_; + FunctionTypeOption_indexptr = The_indexptr; +} + +void apertium_tagger::getIterationsArgument() { + try { + TheFunctionTypeOptionArgument = optarg_unsigned_long("ITERATIONS"); + } catch (const ExceptionType &ExceptionType_) { + std::stringstream what_; + what_ << "invalid argument '" << optarg << "' for '" << option_string() + << '\''; + throw Exception::apertium_tagger::InvalidArgument(what_); + } +} + +static unsigned long parse_unsigned_long(const char *metavar, const char *val) { + char *str_end; + errno = 0; + unsigned long N_0 = std::strtoul(val, &str_end, 10); + + if (*str_end != '\0') { + std::stringstream what_; + what_ << "can't convert " << metavar << " \"" << val << "\" to unsigned long"; + throw Exception::apertium_tagger::str_end_not_eq_NULL(what_); + } + + if (*val == '\0') { + std::stringstream what_; + what_ << "can't convert " << metavar << " of size 1 \"\" to unsigned long"; + throw Exception::apertium_tagger::optarg_eq_NULL(what_); + } + + if (errno == ERANGE) { + std::stringstream what_; + what_ << "can't convert " << metavar << " \"" << val + << "\" to unsigned long, not in unsigned long range"; + throw Exception::apertium_tagger::ERANGE_(what_); + } + + return N_0; +} + +unsigned long apertium_tagger::optarg_unsigned_long(const char *metavar) { + return parse_unsigned_long(metavar, optarg); +} + +void apertium_tagger::get_file_arguments( + bool get_crp_fn, + char **DicFn, char **CrpFn, + char **TaggedFn, char **UntaggedFn, + char **TsxFn, char **ProbFn) { + if (*TheFunctionType != Retrain) { + *DicFn = argv[optind++]; + } + if (get_crp_fn) { + *CrpFn = argv[optind++]; + } + if (*TheFunctionType == Supervised) { + *TsxFn = argv[optind++]; + *ProbFn = argv[optind++]; + *TaggedFn = argv[optind++]; + } + *UntaggedFn = argv[optind++]; + if (*TheFunctionType == Supervised && !get_crp_fn) { + *CrpFn = *UntaggedFn; + } + if (*TheFunctionType != Supervised) { + if (*TheFunctionType != Retrain) { + *TsxFn = argv[optind++]; + } + *ProbFn = argv[optind++]; + } +} + +void apertium_tagger::init_FILE_Tagger(FILE_Tagger &FILE_Tagger_, string const &TsxFn) { + FILE_Tagger_.deserialise(TsxFn); + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); +} + +MorphoStream* apertium_tagger::setup_untagged_morpho_stream( + FILE_Tagger &FILE_Tagger_, + char *DicFn, char *UntaggedFn, + FILE **Dictionary, FILE **UntaggedCorpus) { + if (*TheFunctionType != Retrain) { + *Dictionary = try_open_file_utf8("DICTIONARY", DicFn, "r"); + } + *UntaggedCorpus = try_open_file_utf8("UNTAGGED_CORPUS", UntaggedFn, "r"); + + FILE_Tagger_.read_dictionary(*Dictionary); + + return new FileMorphoStream(*UntaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); +} + +void apertium_tagger::close_untagged_files( + char *DicFn, char *UntaggedFn, + FILE *Dictionary, FILE *UntaggedCorpus) { + if (*TheFunctionType == Supervised || *TheFunctionType == Train) { + try_close_file("DICTIONARY", DicFn, Dictionary); + } + try_close_file("UNTAGGED_CORPUS", UntaggedFn, UntaggedCorpus); +} + +/** Implementation of flags/subcommands */ + +void apertium_tagger::g_StreamTagger(StreamTagger &StreamTagger_) { + locale_global_(); + + expect_file_arguments(nonoptarg, 1, 4); + + std::ifstream SerialisedAnalysisFrequencies; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + SerialisedAnalysisFrequencies); + + try { + StreamTagger_.deserialise(SerialisedAnalysisFrequencies); + } catch (const basic_ExceptionType &basic_ExceptionType_) { + std::stringstream what_; + what_ << "can't deserialise SERIALISED_TAGGER file \"" << argv[optind] + << "\" Reason: " << basic_ExceptionType_.what(); + throw Exception::apertium_tagger::deserialise(what_); + } + if (nonoptarg < 2) { + Stream Input(TheFlags); + StreamTagger_.tag(Input, std::wcout); + return; + } + + std::wifstream Input_stream; + try_open_fstream("INPUT", argv[optind + 1], Input_stream); + + if (nonoptarg < 3) { + Stream Input(TheFlags, Input_stream, argv[optind + 1]); + StreamTagger_.tag(Input, std::wcout); + return; + } + + std::wofstream Output_stream; + try_open_fstream("OUTPUT", argv[optind + 2], Input_stream); + + Stream Input(TheFlags, Input_stream, argv[optind + 1]); + StreamTagger_.tag(Input, Output_stream); +} + +void apertium_tagger::s_StreamTaggerTrainer( + StreamTaggerTrainer &StreamTaggerTrainer_) { + locale_global_(); + + if (TheFunctionTypeOptionArgument != 0 && *TheFunctionTypeType != Perceptron) { + std::stringstream what_; + what_ << "invalid argument '" << TheFunctionTypeOptionArgument + << "' for '--supervised'"; + throw Exception::apertium_tagger::InvalidArgument(what_); + } + + if (*TheFunctionTypeType == Perceptron) { + expect_file_arguments(nonoptarg, 4); + } else { + expect_file_arguments(nonoptarg, 2); + } + + std::wifstream TaggedCorpus_stream; + try_open_fstream("TAGGED_CORPUS", argv[optind + 1], TaggedCorpus_stream); + Stream TaggedCorpus(TheFlags, TaggedCorpus_stream, argv[optind + 1]); + + if (*TheFunctionTypeType == Perceptron) { + std::wifstream UntaggedCorpus_stream; + try_open_fstream("UNTAGGED_CORPUS", argv[optind + 2], UntaggedCorpus_stream); + Stream UntaggedCorpus(TheFlags, UntaggedCorpus_stream, argv[optind + 2]); + + PerceptronTagger &pt = dynamic_cast(StreamTaggerTrainer_); + pt.read_spec(argv[optind + 3]); + pt.train(TaggedCorpus, UntaggedCorpus, TheFunctionTypeOptionArgument); + } else { + StreamTaggerTrainer_.train(TaggedCorpus); + } + + std::ofstream Serialised_basic_Tagger; + try_open_fstream("SERIALISED_TAGGER", argv[optind], + Serialised_basic_Tagger); + + StreamTaggerTrainer_.serialise(Serialised_basic_Tagger); +} + +void apertium_tagger::g_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + expect_file_arguments(nonoptarg, 1, 4); + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", argv[optind], "rb"); + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", argv[optind], Serialised_FILE_Tagger); + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + TaggerWord::generate_marks = TheFlags.getMark(); + FILE_Tagger_.set_show_sf(TheFlags.getShowSuperficial()); + FILE_Tagger_.setNullFlush(TheFlags.getNullFlush()); + if (nonoptarg < 2) + FILE_Tagger_.tagger(stdin, stdout, TheFlags.getFirst()); + else { + FILE *Input = try_open_file("INPUT", argv[optind + 1], "r"); + + if (nonoptarg < 3) + FILE_Tagger_.tagger(Input, stdout, TheFlags.getFirst()); + else { + FILE *Output = try_open_file_utf8("OUTPUT", argv[optind + 2], "w"); + FILE_Tagger_.tagger(Input, Output, TheFlags.getFirst()); + try_close_file("OUTPUT", argv[optind + 2], Output); + } + + try_close_file("INPUT", argv[optind + 1], Input); + } +} + +void apertium_tagger::r_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + expect_file_arguments(nonoptarg, 2); + + char *ProbFn, *UntaggedFn; + + get_file_arguments( + false, + NULL, NULL, NULL, &UntaggedFn, + NULL, &ProbFn); + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", ProbFn, "rb"); + FILE_Tagger_.deserialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); + + FILE_Tagger_.set_debug(TheFlags.getDebug()); + TaggerWord::setArrayTags(FILE_Tagger_.getArrayTags()); + + FILE *UntaggedCorpus; + MorphoStream* ms = setup_untagged_morpho_stream( + FILE_Tagger_, + NULL, UntaggedFn, + NULL, &UntaggedCorpus); + + FILE_Tagger_.train(*ms, TheFunctionTypeOptionArgument); + delete ms; + close_untagged_files( + NULL, UntaggedFn, + NULL, UntaggedCorpus); + + Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); +} + +void apertium_tagger::s_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + if (TheFunctionTypeOptionArgument == 0) { + expect_file_arguments(nonoptarg, 5, 7); + } else { + expect_file_arguments(nonoptarg, 6); + } + char *DicFn, *CrpFn, *TsxFn, *ProbFn, *TaggedFn, *UntaggedFn; + bool do_unsup = nonoptarg == 6; + + get_file_arguments( + do_unsup, + &DicFn, &CrpFn, &TaggedFn, &UntaggedFn, + &TsxFn, &ProbFn); + init_FILE_Tagger(FILE_Tagger_, TsxFn); + + FILE *Dictionary, *UntaggedCorpus; + MorphoStream* ms = setup_untagged_morpho_stream( + FILE_Tagger_, + DicFn, UntaggedFn, + &Dictionary, &UntaggedCorpus); + FILE *TaggedCorpus = try_open_file("TAGGED_CORPUS", TaggedFn, "r"); + FileMorphoStream tms(TaggedCorpus, true, &FILE_Tagger_.get_tagger_data()); + + FILE_Tagger_.init_probabilities_from_tagged_text_(tms, *ms); + try_close_file("TAGGED_CORPUS", TaggedFn, TaggedCorpus); + delete ms; + close_untagged_files( + DicFn, UntaggedFn, + Dictionary, UntaggedCorpus); + + if (do_unsup) { + FILE *Corpus = try_open_file_utf8("CORPUS", CrpFn, "r"); + FILE_Tagger_.train(Corpus, TheFunctionTypeOptionArgument); + try_close_file("CORPUS", CrpFn, Corpus); + } + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); +} + +void apertium_tagger::t_FILE_Tagger(FILE_Tagger &FILE_Tagger_) { + LtLocale::tryToSetLocale(); + + expect_file_arguments(nonoptarg, 4); + + char *DicFn, *TsxFn, *ProbFn, *UntaggedFn; + UntaggedFn = NULL; + + get_file_arguments( + false, + &DicFn, NULL, NULL, &UntaggedFn, + &TsxFn, &ProbFn); + init_FILE_Tagger(FILE_Tagger_, TsxFn); + + FILE *Dictionary, *UntaggedCorpus; + MorphoStream* ms = setup_untagged_morpho_stream( + FILE_Tagger_, + DicFn, UntaggedFn, + &Dictionary, &UntaggedCorpus); + + FILE_Tagger_.init_and_train(*ms, TheFunctionTypeOptionArgument); + delete ms; + close_untagged_files( + DicFn, UntaggedFn, + Dictionary, UntaggedCorpus); + + FILE *Serialised_FILE_Tagger = + try_open_file("SERIALISED_TAGGER", ProbFn, "wb"); + FILE_Tagger_.serialise(Serialised_FILE_Tagger); + try_close_file("SERIALISED_TAGGER", ProbFn, Serialised_FILE_Tagger); + +} +} diff --git a/apertium/apertium_tagger.h b/apertium/tagger.h similarity index 94% rename from apertium/apertium_tagger.h rename to apertium/tagger.h index f375ebd..86063e4 100644 --- a/apertium/apertium_tagger.h +++ b/apertium/tagger.h @@ -13,8 +13,8 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, see . -#ifndef APERTIUM_TAGGER_H -#define APERTIUM_TAGGER_H +#ifndef TAGGER_H +#define TAGGER_H #include "apertium_config.h" @@ -22,8 +22,10 @@ #include "basic_stream_tagger_trainer.h" #include "basic_tagger.h" #include "constructor_eq_delete.h" +#include "err_exception.h" #include "file_tagger.h" #include "optional.h" +#include #include "getopt_long.h" #include @@ -44,7 +46,7 @@ private: static std::string option_string(const struct option &option_); static void locale_global_(); void set_indexptr(); - void flagOptionCase(bool (basic_Tagger::Flags::*GetFlag)() const, + void flagOptionCase(bool (basic_Tagger::Flags::*GetFlag)(), void (basic_Tagger::Flags::*SetFlag)(const bool &)); std::string option_string(); void functionTypeTypeOptionCase(const FunctionTypeType &FunctionTypeType_); @@ -94,4 +96,4 @@ private: }; } -#endif // APERTIUM_TAGGER_H +#endif // TAGGER_H diff --git a/python/apertium_core.i b/python/apertium_core.i index 5751e78..672eb68 100644 --- a/python/apertium_core.i +++ b/python/apertium_core.i @@ -5,6 +5,7 @@ #include #include #include +#include #include class apertium: public Transfer, public Interchunk, public Postchunk @@ -19,6 +20,16 @@ public: void transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); }; +class tagger: public Apertium::apertium_tagger +{ +public: + /** + * Imitates functionality of apertium-tagger + * tagger::tagger() passes int and char** to apertium_tagger::apertium_tagger() int&, char**& respectively + */ + tagger(int argc, char **argv): apertium_tagger(argc, argv){} +}; + void apertium::transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path) { @@ -76,16 +87,50 @@ apertium::postchunk_text(char arg, char *transferfile, char *datafile, char *inp %include %include %include +%include %include +// Wrapper on char ** for char **argv +// Modified for python 3 from http://www.swig.org/Doc1.3/Python.html#Python_nn59 + +%typemap(in) char ** { + if (PyList_Check($input)) { + int size = PyList_Size($input); + int i = 0; + $1 = (char **) malloc((size+1)*sizeof(char *)); + for (i = 0; i < size; i++) { + PyObject *py_obj = PyList_GetItem($input, i); + if (PyUnicode_Check(py_obj)) { + $1[i] = strdup(PyUnicode_AsUTF8(py_obj)); + } + else { + PyErr_SetString(PyExc_TypeError, "list must contain strings"); + free($1); + return NULL; + } + } + $1[i] = 0; + } else { + PyErr_SetString(PyExc_TypeError, "not a list"); + return NULL; + } +} + +%typemap(freearg) char ** { + free((char *) $1); +} + class apertium: public Transfer, public Interchunk, public Postchunk { public: - /** - * Imitates functionality of apertium-core binaries using file path - */ void interchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); void pretransfer(char arg, char *input_path, char *output_path); void postchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); void transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); }; + +class tagger: public Apertium::apertium_tagger +{ +public: + tagger(int argc, char **argv): apertium_tagger(argc, argv); +}; diff --git a/python/setup.py.in b/python/setup.py.in index de18a2b..42ab1e6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -19,11 +19,25 @@ class CustomBuild(build): def get_sources(): sources = ['apertium_core.i'] - cc_sources = ['apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', - 'pretransfer.cc', 'string_utils.cc', 'transfer.cc', 'transfer_data.cc', - 'transfer_instr.cc', 'transfer_mult.cc', 'transfer_token.cc', - 'transfer_word.cc', 'transfer_word_list.cc', 'trx_reader.cc', - 'utf_converter.cc', 'xml_reader.cc'] + cc_sources = [ + # interchunk.cc postchunk.cc transfer.cc + 'apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', 'string_utils.cc', 'transfer.cc', + 'transfer_data.cc', 'transfer_instr.cc', 'transfer_mult.cc', 'transfer_token.cc', 'transfer_word.cc', + 'transfer_word_list.cc', 'trx_reader.cc', 'utf_converter.cc', 'xml_reader.cc', + # 'pretransfer.cc' + 'pretransfer.cc', + # tagger.cc + 'a.cc', 'align.cc', 'analysis.cc', 'basic_5_3_1_tagger.cc', 'basic_5_3_2_tagger.cc', 'basic_exception_type.cc', + 'basic_stream_tagger.cc', 'basic_stream_tagger_trainer.cc', 'basic_tagger.cc', 'collection.cc', + 'constant_manager.cc', 'endian_double_util.cc', 'exception_type.cc', 'feature_vec.cc', + 'feature_vec_averager.cc', 'file_morpho_stream.cc', 'file_tagger.cc', 'hmm.cc', 'i.cc', 'lemma.cc', + 'linebreak.cc', 'lswpost.cc', 'morpheme.cc', 'morpho_stream.cc', 'mtx_reader.cc', 'perceptron_spec.cc', + 'perceptron_tagger.cc', 'sentence_stream.cc', 'shell_utils.cc', 'stream.cc', 'stream_5_3_1_tagger.cc', + 'stream_5_3_1_tagger_trainer.cc', 'stream_5_3_2_tagger.cc', 'stream_5_3_2_tagger_trainer.cc', + 'stream_5_3_3_tagger.cc', 'stream_5_3_3_tagger_trainer.cc', 'stream_tagger.cc', 'stream_tagger_trainer.cc', + 'tag.cc', 'tagger.cc', 'tagger_data.cc', 'tagger_data_hmm.cc', 'tagger_data_lsw.cc', + 'tagger_data_percep_coarse_tags.cc', 'tagger_utils.cc', 'tagger_word.cc', 'tsx_reader.cc', + 'wchar_t_exception_type.cc'] rel_path = '@top_srcdir@/apertium' sources.extend(path.join(rel_path, f) for f in cc_sources) return sources