commit 743ed7f770eb55daa5db5094cc11d70bdc715e63 Author: Daniel Swanson Date: Tue Aug 3 11:24:12 2021 -0500 mmap-able files diff --git a/configure.ac b/configure.ac index 4bcad76..15a9dae 100644 --- a/configure.ac +++ b/configure.ac @@ -54,6 +54,8 @@ AC_CHECK_FUNCS([setlocale strdup]) AC_CHECK_DECLS([fread_unlocked, fwrite_unlocked, fgetc_unlocked, fputc_unlocked, fputs_unlocked]) +AC_CHECK_HEADERS([string_view]) + CPPFLAGS="$CPPFLAGS $CFLAGS $LTTOOLBOX_CFLAGS $LIBXML_CFLAGS $ICU_CFLAGS" LIBS="$LIBS $LTTOOLBOX_LIBS $LIBXML_LIBS $ICU_LIBS" diff --git a/python/setup.py.in b/python/setup.py.in index 85973a7..2461036 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -3,42 +3,25 @@ ''' Setup for SWIG Python bindings for lex-tools ''' -from os import path from distutils.core import Extension, setup -from distutils.command.build import build - - -class CustomBuild(build): - sub_commands = [ - ('build_ext', build.has_ext_modules), - ('build_py', build.has_pure_modules), - ('build_clib', build.has_c_libraries), - ('build_scripts', build.has_scripts), - ] - - -def get_sources(): - sources = ['apertium_lex_tools.i'] - cc_sources = ['lrx_processor.cc'] - rel_path = '../src' - sources.extend(path.join(rel_path, f) for f in cc_sources) - return sources - -def get_include_dirs(): - # Remove '-I' from Flags, as python add '-I' on its own - dirs = '@LTTOOLBOX_CFLAGS@'.replace('-I', '').split() - dirs += '@LIBXML_CFLAGS@'.replace('-I', '').split() - return dirs + ['../src'] +from sys import platform +compile_args = '@CXXFLAGS@'.split() + '@LTTOOLBOX_CFLAGS@'.split() + '@ICU_CFLAGS@'.split() +link_args = [] +if platform == 'darwin': + compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7'] + link_args.append('-mmacosx-version-min=10.7') apertium_lex_tools_module = Extension( name='_apertium_lex_tools', - sources=get_sources(), - swig_opts=['-c++', '-I../src', '-Wall']+'@LTTOOLBOX_CFLAGS@'.split()+'@LIBXML_CFLAGS@'.split()+'@ICU_CFLAGS@'.split(), - 
include_dirs=get_include_dirs(), - library_dirs=['/usr/include/libxml2', '/usr/local/lib'], - extra_compile_args='@CXXFLAGS@'.split(), - extra_link_args=['-lxml2', '-llttoolbox3'], + language='c++', + sources=['apertium_lex_tools.i'], + swig_opts=['-c++', '-I..', '-I@top_srcdir@/src', '-Wall'], + include_dirs=['@top_srcdir@', '@top_srcdir@/src'], + library_dirs=['@top_srcdir@/src/.libs'], + libraries=[], + extra_compile_args=compile_args, + extra_link_args=link_args, ) setup( @@ -50,7 +33,7 @@ setup( author_email='@PACKAGE_BUGREPORT@', license='GPL-3.0+', maintainer_email='@PACKAGE_BUGREPORT@', - cmdclass={'build': CustomBuild}, ext_modules=[apertium_lex_tools_module], py_modules=['apertium_lex_tools'], + data_files=[] ) diff --git a/src/binary_header.h b/src/binary_header.h new file mode 100644 index 0000000..8df71b4 --- /dev/null +++ b/src/binary_header.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2021 Apertium + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _LRX_BINARY_HEADER_ +#define _LRX_BINARY_HEADER_ + +#include +#include + +// Global lttoolbox features +constexpr char HEADER_LRX[4]{'A', 'L', 'R', 'X'}; +enum LRX_FEATURES : uint64_t { + LRX_MMAP = (1ull << 0), // using mmap-compatible format rather than compressed format + LRX_UNKNOWN = (1ull << 1), // Features >= this are unknown, so throw an error; Inc this if more features are added + LRX_RESERVED = (1ull << 63), // If we ever reach this many feature flags, we need a flag to know how to extend beyond 64 bits +}; + +struct weight { + int32_t id; + char _pad[4]{}; + double pisu; +}; + +// Passes w.id and w.pisu through to_le_32/to_le_64 and stores the results back into w (previously the results were discarded). +// NOTE(review): assumes to_le_32/to_le_64 modify their argument in place - confirm against lttoolbox. +inline void weight_to_le(weight& w) { + uint32_t id = static_cast<uint32_t>(w.id); + to_le_32(id); + w.id = static_cast<int32_t>(id); + + uint64_t pisu = *reinterpret_cast<uint64_t*>(&w.pisu); + to_le_64(pisu); + w.pisu = *reinterpret_cast<double*>(&pisu); +} + +// Passes w.id and w.pisu through from_le_32/from_le_64 and stores the results back into w (previously the results were discarded). +// NOTE(review): assumes from_le_32/from_le_64 modify their argument in place - confirm against lttoolbox. +inline void weight_from_le(weight& w) { + uint32_t id = static_cast<uint32_t>(w.id); + from_le_32(id); + w.id = static_cast<int32_t>(id); + + uint64_t pisu = *reinterpret_cast<uint64_t*>(&w.pisu); + from_le_64(pisu); + w.pisu = *reinterpret_cast<double*>(&pisu); +} + +#endif diff --git a/src/lrx_compiler.cc b/src/lrx_compiler.cc index c9dd3aa..01c3427 100644 --- a/src/lrx_compiler.cc +++ b/src/lrx_compiler.cc @@ -16,12 +16,11 @@ */ #include -#include #include #include -#include #include #include +#include using namespace std; @@ -908,32 +907,45 @@ LRXCompiler::procSeq() void LRXCompiler::write(FILE *fst) { - alphabet.write(fst); + fwrite_unlocked(HEADER_LRX, 1, 4, fst); + uint64_t features = 0; + features |= LRX_MMAP; + write_le_64(fst, features); - Compression::multibyte_write(recognisers.size(), fst); - for(auto& it : recognisers) - { - Compression::string_write(it.first, fst); + StringWriter sw; + for (auto& it : alphabet.getTags()) { + sw.add(it); + } + for (auto& it : recognisers) { + sw.add(it.first); + } + sw.write(fst); + + alphabet.write_mmap(fst, sw); + + write_le_64(fst, recognisers.size()); + for (auto& it : recognisers) { + StringRef loc = sw.add(it.first); + write_le_32(fst, loc.start); + write_le_32(fst, loc.count); + it.second.write_mmap(fst, alphabet); debug("+ %d => %S\n", 
it.second.size(), it.first.c_str()); if (debugMode) { it.second.show(alphabet, debug_output, 0, false); } - it.second.write(fst); } - Compression::string_write("main"_u, fst); if(outputGraph) { transducer.show(alphabet, debug_output, 0, false); } - transducer.write(fst); + transducer.write_mmap(fst, alphabet); + write_le_64(fst, weights.size()); for(auto& it : weights) { debug("%.4f %d\n", it.second, it.first); - weight record{it.first, "", it.second}; - weight_to_le(record); - fwrite((void *)&record, 1, sizeof(weight), fst); + write_le_double(fst, it.second); } if(!outputGraph) diff --git a/src/lrx_processor.cc b/src/lrx_processor.cc index 8f6f100..8498e10 100644 --- a/src/lrx_processor.cc +++ b/src/lrx_processor.cc @@ -15,11 +15,14 @@ * along with this program; if not, see . */ -#include +#include #include #include #include #include +#include +#include +#include using namespace std; @@ -45,22 +48,15 @@ LRXProcessor::itow(int i) LRXProcessor::LRXProcessor() -{ - - initial_state = new State(); - - lineno = 1; // Used for rule tracing - pos = 0; - - traceMode = false; - debugMode = false; - outOfWord = true; - nullFlush = false; -} + : alphabet(&str_write), initial_state(new State()) +{} LRXProcessor::~LRXProcessor() { delete initial_state; + if (mmapping) { + munmap(mmap_pointer, mmap_len); + } } void @@ -84,60 +80,119 @@ LRXProcessor::setDebugMode(bool m) void LRXProcessor::load(FILE *in) { - alphabet.read(in); - any_char = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR); - any_tag = alphabet(LRX_PROCESSOR_TAG_ANY_TAG); - any_upper = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER); - any_lower = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER); - word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY); - - int len = Compression::multibyte_read(in); - - while(len > 0) - { - UString name = Compression::string_read(in); - recognisers[name].read(in, alphabet); - if(debugMode) - { - cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n"; + bool mmap = 
false; + fpos_t pos; + if (fgetpos(in, &pos) == 0) { + char header[4]{}; + if (fread_unlocked(header, 1, 4, in) == 4 && + strncmp(header, HEADER_LRX, 4) == 0) { + auto features = read_le_64(in); + if (features >= LRX_UNKNOWN) { + throw std::runtime_error("Rule file has features that are unknown to this version of apertium-lex-tools - upgrade!"); + } + mmap = features & LRX_MMAP; + } else { + fsetpos(in, &pos); } - len--; } - if(debugMode) - { - cerr << "recognisers: " << recognisers.size() << endl; - } + if(mmap) { + fgetpos(in, &pos); + rewind(in); + mmapping = mmap_file(in, mmap_pointer, mmap_len); + if (mmapping) { + void* ptr = mmap_pointer + 12; + ptr = str_write.init(ptr); + + ptr = alphabet.init(ptr); + + uint64_t recognizer_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + for (uint64_t i = 0; i < recognizer_count; i++) { + StringRef tn = reinterpret_cast(ptr)[0]; + ptr += sizeof(StringRef); + UString name = UString{str_write.get(tn)}; + ptr = recognisers[name].init(ptr); + } - UString name = Compression::string_read(in); + ptr = transducer.init(ptr); - transducer.read(in, alphabet); + uint64_t weight_count = reinterpret_cast(ptr)[0]; + ptr += sizeof(uint64_t); + double* weight_list = reinterpret_cast(ptr); + for (uint64_t i = 0; i < weight_count; i++) { + UString sid = "<"_u + itow(i + 1) + ">"_u; + weights[sid] = weight_list[i]; + } + } else { + fsetpos(in, &pos); - // Now read in weights - weight record; - while(fread(&record, sizeof(weight), 1, in)) - { - weight_from_le(record); - UString sid = "<"_u + itow(record.id) + ">"_u; - weights[sid] = record.pisu; + str_write.read(in); - /* - if(debugMode) - { - cerr << sid << " " << record.id << " weight(" << record.pisu << ")\n"; + alphabet.read(in, true); + + uint64_t recognizer_count = read_le_64(in); + for (uint64_t i = 0; i < recognizer_count; i++) { + uint32_t s = read_le_32(in); + uint32_t c = read_le_32(in); + UString name = UString{str_write.get(s, c)}; + recognisers[name].read(in); + } + 
+ transducer.read(in); + + uint64_t weight_count = read_le_64(in); // the count is written as a 64-bit integer (write_le_64 in LRXCompiler::write), not as a double + for (uint64_t i = 0; i < weight_count; i++) { + UString sid = "<"_u + itow(i + 1) + ">"_u; + weights[sid] = read_le_double(in); + } + } + } else { + Alphabet temp_alpha; + temp_alpha.read(in); + fsetpos(in, &pos); + alphabet.read(in, false); + + int len = Compression::multibyte_read(in); + + while(len > 0) { + UString name = Compression::string_read(in); + recognisers[name].read_compressed(in, temp_alpha); + if(debugMode) { + //cerr << "Recogniser: " << name << ", [finals: " << recognisers[name].getFinals().size() << "]\n"; + } + len--; + } + + if(debugMode) { + cerr << "recognisers: " << recognisers.size() << endl; + } + + UString name = Compression::string_read(in); + + transducer.read_compressed(in, temp_alpha); + + // Now read in weights + weight record; + while(fread(&record, sizeof(weight), 1, in)) { + weight_from_le(record); + UString sid = "<"_u + itow(record.id) + ">"_u; + weights[sid] = record.pisu; } - */ } - return; + any_char = alphabet(LRX_PROCESSOR_TAG_ANY_CHAR); + any_tag = alphabet(LRX_PROCESSOR_TAG_ANY_TAG); + any_upper = alphabet(LRX_PROCESSOR_TAG_ANY_UPPER); + any_lower = alphabet(LRX_PROCESSOR_TAG_ANY_LOWER); + word_boundary = alphabet(LRX_PROCESSOR_TAG_WORD_BOUNDARY); } void LRXProcessor::init() { - initial_state->init(transducer.getInitial()); - - anfinals.insert(transducer.getFinals().begin(), transducer.getFinals().end()); + anfinals.insert(&transducer); + initial_state->init(anfinals); escaped_chars.insert('['); escaped_chars.insert(']'); @@ -162,13 +217,12 @@ LRXProcessor::recognisePattern(const UString lu, const UString op) return false; } + set exes; + exes.insert(&recognisers[op]); State *first_state = new State(); - first_state->init(recognisers[op].getInitial()); + first_state->init(exes); State cur = *first_state; - map end_states; - end_states.insert(recognisers[op].getFinals().begin(), recognisers[op].getFinals().end()); - bool readingTag = false; UString 
tag; int val = 0; @@ -249,12 +303,7 @@ LRXProcessor::recognisePattern(const UString lu, const UString op) cerr << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"; } */ - if(cur.isFinal(end_states)) - { - return true; - } - - return false; + return cur.isFinal(exes); } void diff --git a/src/lrx_processor.h b/src/lrx_processor.h index 1a03d86..a865777 100644 --- a/src/lrx_processor.h +++ b/src/lrx_processor.h @@ -23,11 +23,10 @@ #include #include -#include - -#include +#include #include -#include +#include +#include #include using namespace std; @@ -36,21 +35,22 @@ class LRXProcessor { private: - Alphabet alphabet; - TransExe transducer; - map recognisers; + StringWriter str_write; + AlphabetExe alphabet; + TransducerExe transducer; + map recognisers; map weights; vector alive_states; - map anfinals; + set anfinals; set escaped_chars; State *initial_state; - bool traceMode; - bool debugMode; - bool nullFlush; - bool outOfWord; + bool traceMode = false; + bool debugMode = false; + bool nullFlush = false; + bool outOfWord = true; int32_t any_char; int32_t any_upper; @@ -58,8 +58,12 @@ private: int32_t any_tag; int32_t word_boundary; - unsigned int pos; - unsigned long lineno; + unsigned int pos = 0; + unsigned long lineno = 1; // Used for rule tracing + + bool mmapping = false; + void* mmap_pointer = nullptr; + int mmap_len = 0; UString itow(int i); bool recognisePattern(const UString lu, const UString op); diff --git a/src/weight.h b/src/weight.h deleted file mode 100644 index cc12458..0000000 --- a/src/weight.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (C) 2011--2012 Universitat d'Alacant - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef __WEIGHT_H__ -#define __WEIGHT_H__ - -#include -#include -#include - -struct weight { - int32_t id; - char _pad[4]{}; - double pisu; -}; - -// This should all be optimized out on little-endian archs - -template -inline uint64_t U64(T t) { - return static_cast(t); -} - -inline void weight_to_le(weight& w) { - uint32_t id = static_cast(w.id); - uint8_t *bytes = reinterpret_cast(&w.id); - bytes[3] = (id >> 24) & 0xFF; - bytes[2] = (id >> 16) & 0xFF; - bytes[1] = (id >> 8) & 0xFF; - bytes[0] = id & 0xFF; - - bytes = reinterpret_cast(&w.pisu); - uint64_t pisu = *reinterpret_cast(&w.pisu); - bytes[7] = (pisu >> 56) & 0xFF; - bytes[6] = (pisu >> 48) & 0xFF; - bytes[5] = (pisu >> 40) & 0xFF; - bytes[4] = (pisu >> 32) & 0xFF; - bytes[3] = (pisu >> 24) & 0xFF; - bytes[2] = (pisu >> 16) & 0xFF; - bytes[1] = (pisu >> 8) & 0xFF; - bytes[0] = pisu & 0xFF; -} - -inline void weight_from_le(weight& w) { - uint32_t id = static_cast(w.id); - uint8_t *bytes = reinterpret_cast(&id); - id = (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | bytes[0]; - w.id = static_cast(id); - - bytes = reinterpret_cast(&w.pisu); - uint64_t pisu = (U64(bytes[7]) << 56ull) | (U64(bytes[6]) << 48ull) | (U64(bytes[5]) << 40ull) | (U64(bytes[4]) << 32ull) | (U64(bytes[3]) << 24ull) | (U64(bytes[2]) << 16ull) | (U64(bytes[1]) << 8ull) | U64(bytes[0]); - w.pisu = *reinterpret_cast(&pisu); -} - -#endif /* __WEIGHT_H__ */