Index: incubator/apertium-fao-nor/apertium-fao-nor.fao-nor.dix =================================================================== --- incubator/apertium-fao-nor/apertium-fao-nor.fao-nor.dix (revision 79333) +++ incubator/apertium-fao-nor/apertium-fao-nor.fao-nor.dix (revision 79334) @@ -980,8 +980,8 @@

eldurild

eldureld

einginingen

-

einen

-

einein

+

einen

+

einein

egjeg

egeg

EffersøeEffersøe

Index: incubator/apertium-fao-nor/generation.sh =================================================================== --- incubator/apertium-fao-nor/generation.sh (nonexistent) +++ incubator/apertium-fao-nor/generation.sh (revision 79334) @@ -0,0 +1,206 @@ +#!/bin/bash + +set -e -u + +# You can override the below variables by doing e.g. +# $ export CYCLES=1 BLOCK=1M +# before running this script. + +# How many times to follow cycle when expanding with --hfst; gets slow if too high: +declare -ir CYCLES=${CYCLES-0} +# How many parallel pipelines to run (requires GNU parallel installed; +# only worth increasing if CPU's are not saturated and there's free +# RAM while running): +declare -ir J=${J-1} +# How much data to translate before restarting the pipeline (some +# pipelines have memory leaks and need restarting every so often): +declare -r BLOCK=${BLOCK:-100M} + + +if [[ $# -ge 1 && $1 = --hfst ]]; then + HFST=true + shift +else + HFST=false +fi + +if [[ $# -eq 1 ]]; then + mode=$1 + dix=guess +elif [[ $# -eq 2 ]]; then + mode=$1 + dix=$2 +else + cat >&2 <]:' ' + /:<:/ {next} + $2 ~ /|DUE_TO_LT_PROC_HANG|__REGEXP__/ {next} + { + esc=$2 + gsub("/","\\/",esc) + gsub("^","\\^",esc) + gsub("$","\\$",esc) + print "["esc"] ^"$1"/"$2"$ ^./."clb"$" + }' +} + +analyser_to_hfst () { + case "$(head -c4 "$1")" in + HFST) + hfst-fst2fst -t "$1" + ;; + *) # lttoolbox bin's start with their 's :( + lt-print "$1" \ + | sed 's/ /@_SPACE_@/g' \ + | hfst-txt2fst -e ε + ;; + esac +} + +analysis_expansion_hfst () { + analyser_to_hfst "$1" \ + | hfst-project -p lower \ + | hfst-fst2strings -c"${CYCLES}" \ + | awk -v clb="$2" ' + /[][$^{}\\]/{next} # skip escaping hell + /|DUE_TO_LT_PROC_HANG|__REGEXP__/ {next} + { + gsub("]","\\]") + esc=$0 + gsub("/","\\/",esc) + gsub("^","\\^",esc) + gsub("$","\\$",esc) + print "["esc"] ^"$0"$ ^."clb"$" + }' + # give the "disambiguated" output, no forms +} + +only_errs () { + if [[ $# -ge 1 && $1 = --no-@ ]]; then + atfilter () { grep -v '].*/@'; } + else + atfilter () { cat; } + fi + # turn escaped SOLIDUS into DIVISION SLASH, so we don't grep correct stuff ("A/S" is a possible lemma) + sed 's%\\/%∕%g' |\ + atfilter |\ + grep '][^<]*[#/]' +} + +run_mode () { + if command -V parallel &>/dev/null; then + parallel -j"$J" --pipe --block "${BLOCK}" -- bash "$@" + else + bash "$@" + fi +} + +declare -a TMPFILES +cleanup () { + for f in "${TMPFILES[@]}"; do + rm -f "$f" + done +} +trap 'cleanup' EXIT + + +PYTHONPATH="$(dirname "$0"):${PYTHONPATH:-}" +export PYTHONPATH +if command -V pypy3 &>/dev/null; then + python=pypy3 +else + python=python3 +fi +split_ambig=$(mktemp -t gentestvoc.XXXXXXXXXXX) +TMPFILES+=("${split_ambig}") +cat >"${split_ambig}" < "${mode_after_analysis}" + +mode_after_tagger=$(mktemp -t gentestvoc.XXXXXXXXXXX) +TMPFILES+=("${mode_after_tagger}") +grep '|' modes/"${mode}".mode \ + | sed 's/[^|]*|//' \ + | sed 's/.*apertium-pretransfer/apertium-pretransfer/' \ + | sed 's/lt-proc -p[^|]*/cat/' \ + | sed "s%autobil.bin'* *|%& ${split_ambig} |%" \ + | sed 's/\$1/-d/g;s/\$2//g' \ + > "${mode_after_tagger}" +# lt-proc -p fails, that's why we remove that + + +lang1=${mode%%-*} + +clb="" +case ${lang1} in + nno|nob) clb="" ;; +esac + +if $HFST; then + if [[ ${dix} = guess ]]; then + dix=$(xmllint --xpath "string(/modes/mode[@name = '${mode}']/pipeline/program[1]/file[1]/@name)" modes.xml) + fi + analysis_expansion_hfst "${dix}" "${clb}" \ + | run_mode "${mode_after_tagger}" \ + | only_errs +else + if [[ ${dix} = guess ]]; then + lang1dir=$(grep -m1 "^AP_SRC.*apertium-${lang1}" config.log | sed "s/^[^=]*='//;s/'$//") + dix=${lang1dir}/apertium-${lang1}.${lang1}.dix + fi + # Make it possible to edit the .dix while testvoc is running: + dixtmp=$(mktemp -t gentestvoc.XXXXXXXXXXX) + TMPFILES+=("${dixtmp}") + cat "${dix}" > "${dixtmp}" + analysis_expansion "${dixtmp}" "${clb}" \ + | run_mode "${mode_after_analysis}" \ + | only_errs --no-@ +fi Property changes on: incubator/apertium-fao-nor/generation.sh ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property