Index: incubator/apertium-bel-rus/dev/testvoc/bidix-unknowns.sh =================================================================== --- incubator/apertium-bel-rus/dev/testvoc/bidix-unknowns.sh (nonexistent) +++ incubator/apertium-bel-rus/dev/testvoc/bidix-unknowns.sh (revision 66830) @@ -0,0 +1,98 @@ +#!/bin/bash + +set -e -u + +if [[ $# -eq 1 ]]; then + lang=$1 + monodix=guess + bidix=guess + side=guess +elif [[ $# -eq 4 ]]; then + lang=$1 + monodix=$2 + bidix=$3 + side=$4 +else + cat >&2 <&2 +lt-expand "${monodix}" \ + | grep -ve __REGEXP__ \ + | sed 's/[^:]*//; s/\(<.*>\)\(#.*\)/\2\1/' \ + | LC_ALL=C sort -u >"${exp}" + +in_mono () { + # bidix has prefixes of monodix, have to use look instead of comm :-/ + LC_ALL=C look "$1" "${exp}" >/dev/null +} +echo "Expanding bidix and checking for entries missing from monodix …" >&2 +lt-expand "${bidix}" \ + | awk -vside="${side}" -F':|:[<>]:' ' + BEGIN { + if(side=="l") { + nside=1 + LR=":>:" + RL=":<:" + } + else { + nside=2 + LR=":<:" # flip it + RL=":>:" # and reverse + } + } + # Make bidix match up with monodix (left=left, right=right): + /:>:/ { print LR $nside; next } + /:<:/ { print RL $nside; next } + /:/ { print ":"$nside } +' \ + | while read -r bientry; do + # Bidix now normalised to have the requested monodix on the "left" + case ${bientry} in + ":>:"* ) # If it's LR in bidix, then we check if unmarked / LR is in monodix + in_mono "${bientry##:>}" || in_mono "${bientry}" || echo "${bientry}" + ;; + ":<:"* ) # If it's RL in bidix, then we check if unmarked / RL is in monodix + in_mono "${bientry##:<}" || in_mono "${bientry}" || echo "${bientry}" + ;; + ":"* ) # If it's unmarked in bidix, then we check if unmarked / LR / RL in monodix + in_mono "${bientry}" || in_mono ":>${bientry}" || in_mono ":<${bientry}" || echo "${bientry}" + ;; + *) echo "ERROR: unexpected bientry format: ${bientry}" >&2;; + esac + done Index: incubator/apertium-bel-rus/dev/testvoc/generation.sh =================================================================== --- incubator/apertium-bel-rus/dev/testvoc/generation.sh (nonexistent) +++ incubator/apertium-bel-rus/dev/testvoc/generation.sh (revision 66830) @@ -0,0 +1,98 @@ +#!/bin/bash + +set -e -u + +if [[ $# -eq 1 ]]; then + mode=$1 + dix=guess +elif [[ $# -eq 2 ]]; then + mode=$1 + dix=$2 +else + cat >&2 <"clb"$" + }' + # give the "disambiguated" output, no forms +} + +split_ambig () { + if command -V pypy3 &>/dev/null; then + python=pypy3 + else + python=python3 + fi + PYTHONPATH="$(dirname "$0"):${PYTHONPATH:-}" "${python}" -c ' +from streamparser import parse_file, readingToString +import sys +for blank, lu in parse_file(sys.stdin, withText=True): + print(blank+" ".join("^{}/{}$".format(lu.wordform, readingToString(r)) + for r in lu.readings), + end="")' + +} + +mode_after_analysis () +{ + eval $(grep '|' "$1" |\ + sed 's/[^|]*|//' |\ + sed 's/.*apertium-pretransfer/apertium-pretransfer/' |\ + sed 's/lt-proc -p[^|]*/cat/' |\ + sed 's/autobil.bin *|/& split_ambig |/' |\ + sed 's/\$1/-d/g;s/\$2//g') + # lt-proc -p fails +} + +only_errs () { + grep '][^<]*[#/]' +} + + +lang1=${mode%%-*} + +if [[ ${dix} = guess ]]; then + lang1dir=$(grep -m1 "^AP_SRC.*apertium-${lang1}" config.log | sed "s/^[^=]*='//;s/'$//") + dix=${lang1dir}/apertium-${lang1}.${lang1}.dix +fi + +clb="" +case ${lang1} in + nno|nob) clb="" ;; +esac + +# Make it possible to edit the .dix while testvoc is running: +dixtmp=$(mktemp -t gentestvoc.XXXXXXXXXXX) +trap 'rm -f "${dixtmp}"' EXIT +cat "${dix}" > "${dixtmp}" + +analysis_expansion "${mode}" "${clb}" \ + | mode_after_analysis modes/"${mode}".mode \ + | only_errs Index: incubator/apertium-bel-rus/dev/testvoc/streamparser.py =================================================================== --- incubator/apertium-bel-rus/dev/testvoc/streamparser.py (nonexistent) +++ incubator/apertium-bel-rus/dev/testvoc/streamparser.py (revision 66830) @@ -0,0 +1,143 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" +Usage: streamparser.py [FILE] + +Consumes input from a file (first argument) or stdin, parsing and pretty printing the readings of lexical units found. +""" + +import re, pprint, sys, itertools, fileinput +from collections import namedtuple + +SReading = namedtuple('SReading', ['baseform', 'tags']) + +def subreadingToString(sub): + return sub.baseform+"".join("<"+t+">" for t in sub.tags) + +def readingToString(reading): + return "+".join(subreadingToString(sub) for sub in reading) + +def mainpos(reading, ltr=False): + """Return the first part-of-speech tag of a reading. If there are + several subreadings, by default give the first tag of the last + subreading. If ltr=True, give the first tag of the first + subreading, see + http://beta.visl.sdu.dk/cg3/single/#sub-stream-apertium for more + information. + + """ + if ltr: + return reading[0].tags[0] + else: + return reading[-1].tags[0] + +class LexicalUnit: + + """A lexical unit consisting of a lemma and its readings. + + Attributes: + lexicalUnit (str): The lexical unit in Apertium stream format. + wordform (str): The word form (surface form) of the lexical unit. + readings (list of list of SReading): The analyses of the lexical unit with sublists containing all subreadings. + """ + + def __init__(self, lexicalUnit): + self.lexicalUnit = lexicalUnit + + cohort = re.split(r'(?]+>)+)', reading) + for subreading in subreadingParts: + baseform = subreading[0].lstrip('+') + tags = re.findall(r'<([^>]+)>', subreading[1]) + + subreadings.append(SReading(baseform=baseform, tags=tags)) + + self.readings.append(subreadings) + + def __repr__(self): + return self.lexicalUnit + + +def parse(stream, withText=False): + """Generates lexical units from a character stream. + + Args: + stream (iterable): A character stream containing lexical units, superblanks and other text. + withText (bool, optional): A boolean defining whether to output preceding text with each lexical unit. + + Yields: + LexicalUnit: The next lexical unit found in the character stream. (if withText is False) + (str, LexicalUnit): The next lexical unit found in the character stream and the the text that seperated it from the prior unit in a tuple. (if withText is True) + """ + + buffer = '' + textBuffer = '' + inLexicalUnit = False + inSuperblank = False + + for char in stream: + + if inSuperblank: + if char == ']': + inSuperblank = False + textBuffer += char + elif char == '\\': + textBuffer += char + textBuffer += next(stream) + else: + textBuffer += char + elif inLexicalUnit: + if char == '$': + if withText: + yield (textBuffer, LexicalUnit(buffer)) + else: + yield LexicalUnit(buffer) + buffer = '' + textBuffer = '' + inLexicalUnit = False + elif char == '\\': + buffer += char + buffer += next(stream) + else: + buffer += char + else: + if char == '[': + inSuperblank = True + textBuffer += char + elif char == '^': + inLexicalUnit = True + elif char == '\\': + textBuffer += char + textBuffer += next(stream) + else: + textBuffer += char + + +def parse_file(f, withText=False): + """Generates lexical units from a file. + + Args: + f (file): A file containing lexical units, superblanks and other text. + + Yields: + LexicalUnit: The next lexical unit found in the file. + """ + + return parse(itertools.chain.from_iterable(f), withText) + + +if __name__ == '__main__': + lexicalUnits = parse_file(fileinput.input()) + + for lexicalUnit in lexicalUnits: + pprint.pprint(lexicalUnit.readings, width=120) Index: incubator/apertium-bel-rus/apertium-bel-rus.bel-rus.t1x =================================================================== --- incubator/apertium-bel-rus/apertium-bel-rus.bel-rus.t1x (revision 66827) +++ incubator/apertium-bel-rus/apertium-bel-rus.bel-rus.t1x (revision 66830) @@ -152,6 +152,7 @@ + Index: incubator/apertium-bel/apertium-bel.bel.dix =================================================================== --- incubator/apertium-bel/apertium-bel.bel.dix (revision 66827) +++ incubator/apertium-bel/apertium-bel.bel.dix (revision 66830) @@ -86,6 +86,7 @@ + @@ -93,7 +94,7 @@ [0-9]+([.,][0-9]+)?

- [0-9]+([.,][0-9]+)? ?%

+ [0-9]+([.,][0-9]+)? ?%

[0-9]+([.,][0-9]+)?-[0-9]+([.,][0-9]+)?%?

[0-9]+([.,][0-9]+)?[:.][0-9]+([.,][0-9]+)?