commit e1735e5fc6ccded722b045e4e43524e920571f91 Author: Lokendra Singh Date: Sat Jul 6 17:59:57 2019 +0530 Swig wrapper (#51) Python wrapper for interchunk, postchunk, transfer, pretransfer. By Lokendra Singh diff --git a/.gitignore b/.gitignore index 6855cf4..c056295 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,12 @@ test-driver /tests/data/nno-nob.t1x.bin /tests/test-suite.log /compile_commands.json + +/python/Makefile.in +/python/Makefile +/python/apertium_core.py +/python/apertium_core_wrap.cpp +/python/setup.py +/python/build* +*.egg-info/ +*.egg diff --git a/Makefile.am b/Makefile.am index bae0ea7..5c6ff78 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,6 +11,10 @@ pkgconfig_DATA = apertium.pc aclocaldir = $(datadir)/aclocal aclocal_DATA = apertium.m4 +if HAVE_PYTHON_BINDINGS +SUBDIRS += python +endif + EXTRA_DIST=autogen.sh README-MODES apertium.m4 utf8 tests install-data-local: diff --git a/apertium/Makefile.am b/apertium/Makefile.am index d2d4ba3..377c1a0 100644 --- a/apertium/Makefile.am +++ b/apertium/Makefile.am @@ -40,6 +40,7 @@ h_sources = a.h \ optional.h \ perceptron_spec.h \ perceptron_tagger.h \ + pretransfer.h \ postchunk.h \ sentence_stream.h \ serialiser.h \ @@ -131,6 +132,7 @@ cc_sources = a.cc \ file_morpho_stream.cc \ perceptron_spec.cc \ perceptron_tagger.cc \ + pretransfer.cc \ postchunk.cc \ sentence_stream.cc \ stream.cc \ @@ -260,6 +262,7 @@ apertium_DATA = deformat.xsl reformat.xsl new2old.xsl lexchoice.xsl \ apertium-createmodes.awk apertium_pretransfer_SOURCES = apertium_pretransfer.cc +apertium_pretransfer_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) $(lib_LTLIBRARIES) apertium_multiple_translations_SOURCES = apertium-multiple-translations.cc apertium_multiple_translations_LDADD = $(APERTIUM_LIBS) -lapertium$(GENERIC_MAJOR_VERSION) $(lib_LTLIBRARIES) apertium_destxt_SOURCES = apertium_destxt.cc diff --git a/apertium/apertium_pretransfer.cc b/apertium/apertium_pretransfer.cc index abdf33e..c46ab54 100644 --- a/apertium/apertium_pretransfer.cc +++ b/apertium/apertium_pretransfer.cc @@ -21,163 +21,16 @@ #include #include "getopt_long.h" -#include -#include "apertium_config.h" -#include - #ifdef _MSC_VER #include #include #endif +#include #include using namespace Apertium; using namespace std; -bool compound_sep = false; - -void readAndWriteUntil(FILE *input, FILE *output, int const charcode) -{ - int mychar; - - while((mychar = fgetwc_unlocked(input)) != charcode) - { - if(feof(input)) - { - wcerr << L"ERROR: Unexpected EOF" << endl; - exit(EXIT_FAILURE); - } - fputwc_unlocked(mychar, output); - if(mychar == L'\\') - { - mychar = fgetwc(input); - fputwc(mychar, output); - } - } -} - -void procWord(FILE *input, FILE *output, bool surface_forms) -{ - int mychar; - wstring buffer = L""; - - bool buffer_mode = false; - bool in_tag = false; - bool queuing = false; - - if(surface_forms) - { - while((mychar = fgetwc_unlocked(input)) != L'/') ; - } - - while((mychar = fgetwc_unlocked(input)) != L'$') - { - if(feof(input)) - { - wcerr << L"ERROR: Unexpected EOF" << endl; - exit(EXIT_FAILURE); - } - - switch(mychar) - { - case L'<': - in_tag = true; - if(!buffer_mode) - { - buffer_mode = true; - } - break; - - case L'>': - in_tag = false; - break; - - case L'#': - if(buffer_mode) - { - buffer_mode = false; - queuing = true; - } - break; - } - - if(buffer_mode) - { - if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && - (mychar != L'~' || (mychar == L'~' && in_tag == true))) - { - buffer += static_cast(mychar); - } - else if(in_tag == false && mychar == L'+') - { - buffer.append(L"$ ^"); - } - else if(in_tag == false && mychar == L'~' and compound_sep == true) - { - buffer.append(L"$^"); - } - } - else - { - if(mychar == L'+' && queuing == true) - { - buffer.append(L"$ ^"); - buffer_mode = true; - } - else - { - fputwc_unlocked(mychar, output); - } - } - - } - fputws_unlocked(buffer.c_str(), output); -} - -void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms) -{ - while(true) - { - int mychar = fgetwc_unlocked(input); - if(feof(input)) - { - break; - } - switch(mychar) - { - case L'[': - fputwc_unlocked(L'[', output); - readAndWriteUntil(input, output, L']'); - fputwc_unlocked(L']', output); - break; - - case L'\\': - fputwc_unlocked(mychar, output); - fputwc_unlocked(fgetwc_unlocked(input), output); - break; - - case L'^': - fputwc_unlocked(mychar, output); - procWord(input, output, surface_forms); - fputwc_unlocked(L'$', output); - break; - - case L'\0': - fputwc_unlocked(mychar, output); - - if(null_flush) - { - fflush(output); - } - break; - - default: - fputwc_unlocked(mychar, output); - break; - } - } -} - void usage(char *progname) { wcerr << L"USAGE: " << basename(progname) << L" [input_file [output_file]]" << endl; @@ -188,12 +41,10 @@ void usage(char *progname) exit(EXIT_FAILURE); } - - - int main(int argc, char *argv[]) { LtLocale::tryToSetLocale(); + bool compound_sep = false; bool null_flush = false; bool surface_forms = false; @@ -277,5 +128,5 @@ int main(int argc, char *argv[]) _setmode(_fileno(output), _O_U8TEXT); #endif - processStream(input, output, null_flush, surface_forms); + processStream(input, output, null_flush, surface_forms, compound_sep); } diff --git a/apertium/pretransfer.cc b/apertium/pretransfer.cc new file mode 100644 index 0000000..7ff1120 --- /dev/null +++ b/apertium/pretransfer.cc @@ -0,0 +1,143 @@ +#include + +void readAndWriteUntil(FILE *input, FILE *output, int const charcode) +{ + int mychar; + + while((mychar = fgetwc_unlocked(input)) != charcode) + { + if(feof(input)) + { + wcerr << L"ERROR: Unexpected EOF" << endl; + exit(EXIT_FAILURE); + } + fputwc_unlocked(mychar, output); + if(mychar == L'\\') + { + mychar = fgetwc(input); + fputwc(mychar, output); + } + } +} + +void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep) +{ + int mychar; + wstring buffer = L""; + + bool buffer_mode = false; + bool in_tag = false; + bool queuing = false; + + if(surface_forms) + { + while((mychar = fgetwc_unlocked(input)) != L'/') ; + } + + while((mychar = fgetwc_unlocked(input)) != L'$') + { + if(feof(input)) + { + wcerr << L"ERROR: Unexpected EOF" << endl; + exit(EXIT_FAILURE); + } + + switch(mychar) + { + case L'<': + in_tag = true; + if(!buffer_mode) + { + buffer_mode = true; + } + break; + + case L'>': + in_tag = false; + break; + + case L'#': + if(buffer_mode) + { + buffer_mode = false; + queuing = true; + } + break; + } + + if(buffer_mode) + { + if((mychar != L'+' || (mychar == L'+' && in_tag == true)) && + (mychar != L'~' || (mychar == L'~' && in_tag == true))) + { + buffer += static_cast(mychar); + } + else if(in_tag == false && mychar == L'+') + { + buffer.append(L"$ ^"); + } + else if(in_tag == false && mychar == L'~' and compound_sep == true) + { + buffer.append(L"$^"); + } + } + else + { + if(mychar == L'+' && queuing == true) + { + buffer.append(L"$ ^"); + buffer_mode = true; + } + else + { + fputwc_unlocked(mychar, output); + } + } + + } + fputws_unlocked(buffer.c_str(), output); +} + +void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep) +{ + while(true) + { + int mychar = fgetwc_unlocked(input); + if(feof(input)) + { + break; + } + switch(mychar) + { + case L'[': + fputwc_unlocked(L'[', output); + readAndWriteUntil(input, output, L']'); + fputwc_unlocked(L']', output); + break; + + case L'\\': + fputwc_unlocked(mychar, output); + fputwc_unlocked(fgetwc_unlocked(input), output); + break; + + case L'^': + fputwc_unlocked(mychar, output); + procWord(input, output, surface_forms, compound_sep); + fputwc_unlocked(L'$', output); + break; + + case L'\0': + fputwc_unlocked(mychar, output); + + if(null_flush) + { + fflush(output); + } + break; + + default: + fputwc_unlocked(mychar, output); + break; + } + } +} diff --git a/apertium/pretransfer.h b/apertium/pretransfer.h new file mode 100644 index 0000000..09ecd86 --- /dev/null +++ b/apertium/pretransfer.h @@ -0,0 +1,16 @@ +#ifndef PRETRANSFER_H +#define PRETRANSFER_H + +#include +#include + +#include +#include +#include +#include + +void readAndWriteUntil(FILE *input, FILE *output, int const charcode); +void procWord(FILE *input, FILE *output, bool surface_forms, bool compound_sep); +void processStream(FILE *input, FILE *output, bool null_flush, bool surface_forms, bool compound_sep); + +#endif diff --git a/configure.ac b/configure.ac index c326415..46afc87 100644 --- a/configure.ac +++ b/configure.ac @@ -191,4 +191,18 @@ AX_CHECK_COMPILE_FLAG([-std=c++20], [CXXFLAGS="$CXXFLAGS -std=c++20"], [ ]) ]) -AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile]) +AC_CONFIG_FILES([python/setup.py]) + +AC_SUBST(APERTIUM_CFLAGS) +AC_SUBST(LIBS) + +AM_PATH_PYTHON([3.4], [], [AC_MSG_WARN([Can't generate SWIG wrapper or run tests without Python])]) + +AC_ARG_ENABLE([python-bindings], + AS_HELP_STRING([--enable-python-bindings], + [build python bindings (default=disabled)]), + [enable_python_bindings=$enableval], + [enable_python_bindings=no]) +AM_CONDITIONAL([HAVE_PYTHON_BINDINGS], [test x$enable_python_bindings = xyes]) + +AC_OUTPUT([Makefile apertium.pc apertium/Makefile tests/Makefile tests/tagger/Makefile python/Makefile]) diff --git a/python/Makefile.am b/python/Makefile.am new file mode 100644 index 0000000..8aa782b --- /dev/null +++ b/python/Makefile.am @@ -0,0 +1,9 @@ +SWIG_INTERFACE = apertium_core.i + +BUILT_SOURCES = %_wrap.cpp + +%_wrap.cpp: $(SWIG_INTERFACE) setup.py + $(PYTHON) setup.py build + +install-exec-local: + $(PYTHON) setup.py install --prefix=$(DESTDIR)$(prefix) diff --git a/python/apertium_core.i b/python/apertium_core.i new file mode 100644 index 0000000..5751e78 --- /dev/null +++ b/python/apertium_core.i @@ -0,0 +1,91 @@ +%module apertium_core + +%{ +#define SWIG_FILE_WITH_INIT +#include +#include +#include +#include + +class apertium: public Transfer, public Interchunk, public Postchunk +{ +public: + /** + * Imitates functionality of apertium-core binaries using file path + */ + void interchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); + void pretransfer(char arg, char *input_path, char *output_path); + void postchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); + void transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); +}; + +void +apertium::transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path) +{ + FILE *input = fopen(input_path, "r"), *output = fopen(output_path, "w"); + + switch(arg) + { + case 'b': + setPreBilingual(true); + setUseBilingual(false); + break; + + case 'n': + setUseBilingual(false); + break; + } + Transfer::read(transferfile, datafile); + transfer(input, output); + fclose(input); + fclose(output); +} + +void +apertium::interchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path) +{ + FILE *input = fopen(input_path, "r"), *output = fopen(output_path, "w"); + Interchunk::read(transferfile, datafile); + interchunk(input, output); + fclose(input); + fclose(output); +} + +void +apertium::pretransfer(char arg, char *input_path, char *output_path) +{ + bool useMaxEnt = false; + FILE *input = fopen(input_path, "r"), *output = fopen(output_path, "w"); + processStream(input, output, false, false, false); + fclose(input); + fclose(output); +} + +void +apertium::postchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path) +{ + FILE *input = fopen(input_path, "r"), *output = fopen(output_path, "w"); + Postchunk::read(transferfile, datafile); + postchunk(input, output); + fclose(input); + fclose(output); +} + +%} + +%include +%include +%include +%include + +class apertium: public Transfer, public Interchunk, public Postchunk +{ +public: + /** + * Imitates functionality of apertium-core binaries using file path + */ + void interchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); + void pretransfer(char arg, char *input_path, char *output_path); + void postchunk_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); + void transfer_text(char arg, char *transferfile, char *datafile, char *input_path, char *output_path); +}; diff --git a/python/setup.py.in b/python/setup.py.in new file mode 100644 index 0000000..de18a2b --- /dev/null +++ b/python/setup.py.in @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +''' +Setup for SWIG Python bindings for apertium +''' +from os import path +from distutils.core import Extension, setup +from distutils.command.build import build + + +class CustomBuild(build): + sub_commands = [ + ('build_ext', build.has_ext_modules), + ('build_py', build.has_pure_modules), + ('build_clib', build.has_c_libraries), + ('build_scripts', build.has_scripts), + ] + + +def get_sources(): + sources = ['apertium_core.i'] + cc_sources = ['apertium_re.cc', 'interchunk.cc', 'interchunk_word.cc', 'postchunk.cc', + 'pretransfer.cc', 'string_utils.cc', 'transfer.cc', 'transfer_data.cc', + 'transfer_instr.cc', 'transfer_mult.cc', 'transfer_token.cc', + 'transfer_word.cc', 'transfer_word_list.cc', 'trx_reader.cc', + 'utf_converter.cc', 'xml_reader.cc'] + rel_path = '@top_srcdir@/apertium' + sources.extend(path.join(rel_path, f) for f in cc_sources) + return sources + +def get_include_dirs(): + # Remove '-I' from Flags, as python add '-I' on its own + dirs = '@APERTIUM_CFLAGS@'.replace('-I', '').split() + return dirs + ['..'] + + +apertium_core_module = Extension( + name='_apertium_core', + sources=get_sources(), + swig_opts=['-c++', '-I..', '-I@top_srcdir@/apertium', '-Wall']+'@APERTIUM_CFLAGS@'.split(), + include_dirs=get_include_dirs(), + library_dirs=['/usr/include/libxml2', '/usr/local/lib'], + extra_compile_args='@CPPFLAGS@'.split()+'@CXXFLAGS@'.split(), + extra_link_args='@LIBS@'.split(), +) + +setup( + name='@PACKAGE@', + version='@PACKAGE_VERSION@', + description='SWIG interface to @PACKAGE_NAME@', + long_description='SWIG interface to @PACKAGE_NAME@ for use in apertium-python', + # TODO: author, maintainer, url + author_email='@PACKAGE_BUGREPORT@', + license='GPL-3.0+', + maintainer_email='@PACKAGE_BUGREPORT@', + cmdclass={'build': CustomBuild}, + ext_modules=[apertium_core_module], + py_modules=['apertium_core'], +)