commit 9ba00c8982bfb1e9060190badc8059dbf7b5c5af Author: Alexandra Date: Wed Jul 12 17:41:44 2023 -0400 corpus_and_lexd_file_generation diff --git a/lang_data/amu-fonipa.lexd b/lang_data/amu-fonipa.lexd new file mode 100644 index 0000000..20c81e4 --- /dev/null +++ b/lang_data/amu-fonipa.lexd @@ -0,0 +1,120 @@ +PATTERNS +vproc +sust +conj +part +prep +adv +vest +adj +pron +misc +spos +vtr +vintr + +LEXICON vproc +:kwinckwii + + +LEXICON sust +:nnʔã +:tmaãʔ +:ɲõndaa +:ɲʔoõ +:ndɛɛ +:ti +:ɲʔoõ-ja +:ndɛ +:hndɛ + + +LEXICON conj +:kancha +:sɛɛ +:hoʔ +:ndoʔ +:meiĩ + + +LEXICON part +:na + + +LEXICON prep +:nakiiʔ + + +LEXICON adv +:lhooɲe +:maheʔnɟo +:ɲhaã +:juu +:tʃaʔʃhẽ +:lhoo +:maʃhẽ +:ɲhõ + + +LEXICON vest +:hnɟu +:mʔaã +:matʃʔeenaʔ +:nckwiiʔ + + +LEXICON adj +:kwii +:hnde + + +LEXICON pron +:ha +:hoonaʔ +:hoona + + +LEXICON misc +:hnɟu-ja +:Jesus +:Santana +:mʔaã-ja +:grabacion +:programa +:Alianza +:de +:Lenguas +:en +:Peligro +:proyekto +:waa-ɲe +:makiaʔ +:luu +:lʔuu +:mʔa +:nda-ja +:ʃo +:waa-ne +:ne +:wa +:nggwana +:asta + + +LEXICON spos +:lote-ja +:tsʔiaãʔ +:nda + + +LEXICON vtr +:kwilʔuee-na +:kwilaʔneĩ +:laʔneĩ-na +:kwilaʔthõ-nɟɔ +:laʔlheii-na + + +LEXICON vintr +:kwilʔa-ja + diff --git a/lang_data/corpus.txt b/lang_data/corpus.txt new file mode 100644 index 0000000..98ac979 --- /dev/null +++ b/lang_data/corpus.txt @@ -0,0 +1,22 @@ +ha hnɟu-ja Jesus Santana +mʔaã-ja lhooɲe na maheʔnɟo +kwilʔa-ja grabacion ɲhaã nakiiʔ +programa na hnɟu Alianza +de Lenguas en Peligro +kwii proyekto +juu na nnʔã na mʔaã nakiiʔ tmaãʔ waa-ɲe +na kwilʔuee-na +nnʔã na kwilaʔneĩ ɲõndaa +makiaʔ ɲʔoõ luu +kancha na na hnde nnʔã kwilaʔneĩ +ɲʔoõ lʔuu tʃaʔʃhẽ na lote-ja lʔuu +hoonaʔ ndɛɛ laʔneĩ-na sɛɛ ha mʔa nda-ja +ti ʃo ndɛɛ laʔneĩ-na hoʔ na matʃʔeenaʔ na +kwinckwii lʔuu ɲʔoõ-ja +ndoʔ hoʔ tsʔiaãʔ tmaãʔ waa-ne +na mʔaã na lhoo ne +kwilaʔthõ-nɟɔ lhoo ne +na nda-ja +ndoʔ nda hoona na ti nckwiiʔ ɲʔoõ wa maʃhẽ nggwana lʔuu na ndɛ laʔneĩ-na asta hndɛ laʔlheii-na +ɲhõ meiĩ hoʔ na +kwilʔa-ja grabacion na maheʔnɟo diff --git a/lang_data/flextext_word_by_pos_extractor.py b/lang_data/flextext_word_by_pos_extractor.py index 1577094..06b2898 100644 --- a/lang_data/flextext_word_by_pos_extractor.py +++ b/lang_data/flextext_word_by_pos_extractor.py @@ -1,38 +1,46 @@ import sys -import csv import xml.etree.ElementTree as ET word_by_pos = {} -corpus = [] -#Run as python3 flextext_wordform_extractor.py input_files[any number] out_file. -with open(sys.argv[-1], "w", newline="") as csvfile: - fieldnames = ['Wordform', 'POS'] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - for i in range(1, len(sys.argv)-1): - tree = ET.parse(sys.argv[i]) - print("working with file", i) - root = tree.getroot() - #save information about which language for future use - lang = [x.get("lang") for x in root.findall('.//languages/language/[@font="Charis SIL"]')][0] - for word in root.findall('.//phrases/phrase/words/word'): - for item in word.findall('./item'): - if item.get("type") == 'txt': - surface = item.text - if word.findall('./item[@type="pos"]') != []: - if item.get("type") == 'pos': - pos = item.text - word_by_pos[surface] = pos - else: - word_by_pos[surface] = "misc" - #print(len(word_by_pos)) - writer.writeheader() - for word in word_by_pos.items(): - #print(word) - writer.writerow({'Wordform':word[0], 'POS':word[1]}) - for phrases in root.findall('.//phrases/phrase'): - phrase = '' - #for word in phrase.findall() - for words in phrases.findall('./words/word/item[@lang={{lang}}]'): - phrase = phrase + words.text + " " - corpus.append(phrase.strip(" ")) - #export to a corpus file - print(corpus) +#Run as python3 flextext_wordform_extractor.py input_files[any number] +for i in range(1, len(sys.argv)): + tree = ET.parse(sys.argv[i]) + print("working with file", i) + root = tree.getroot() + #save information about which language for specifying language + lang = [x.get("lang") for x in root.findall('.//languages/language/[@font="Charis SIL"]')][0] + lexd = open('%s.lexd' % lang, 'w') + morphlexd = open('morph_%s.lexd' % lang, 'w') + corpus = open('%s_corpus.txt' % lang, "w") + #Loop through the words in each phrase + for word in root.findall('.//phrases/phrase/words/word'): + for item in word.findall('./item'): + #for wordform in target language, get text + if item.get("type") == 'txt' and item.get("lang") == lang: + surface = item.text + #extract any available pos information about the wordform + if word.findall('./item[@type="pos"]') != []: + if item.get("type") == 'pos': + pos = item.text + word_by_pos[surface] = pos + #or else assign miscellaneous POS + else: + word_by_pos[surface] = "misc" +pos_tags = list(set(word_by_pos.values())) +#temporarily default patterns to part of speech until further steps in the project +lexd.write("PATTERNS\n") +[lexd.write(pos_tags[x].replace(".", "").replace(" ", "") + "\n") for x in range(len(pos_tags))] +for pos_tag in pos_tags: + lexd.write("\nLEXICON " + pos_tag.replace(".", "").replace(" ", "") + "\n") + for entry in word_by_pos.items(): + if pos_tag in entry: + lexd.write("<" + pos_tag.replace(".", "").replace(" ", "") + ">:" + entry[0] + "\n") + lexd.write("\n") +#this loop extracts all phrases to a corpus file +for phrases in root.findall('.//phrases/phrase'): + iso_path = './words/word/item/[@lang="' + lang + '"]' + phrase = '' + #for word in phrase.findall() + for words in phrases.findall(iso_path): + phrase = phrase + words.text + " " + corpus.write(phrase.strip(" ") +"\n") +corpus.close()