commit 088b4754dcd560fb9c3b5d8994d43b8183c07fa0
Author: Alexandra
Date:   Wed Jul 5 10:28:24 2023 -0400

    pos_wordform_updates

diff --git a/lang_data/flextext_word_by_pos_extractor.py b/lang_data/flextext_word_by_pos_extractor.py
new file mode 100644
index 0000000..32352c6
--- /dev/null
+++ b/lang_data/flextext_word_by_pos_extractor.py
@@ -0,0 +1,31 @@
+import sys
+import csv
+import xml.etree.ElementTree as ET
+#This program maps every surface wordform in any number of flextext files to its part of speech
+#Run as python3 flextext_word_by_pos_extractor.py input_files[any number] out_file.csv
+word_by_pos = {}
+with open(sys.argv[-1], "w", newline="", encoding="utf-8") as csvfile:
+    fieldnames = ['Wordform', 'POS']
+    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+    for i in range(1, len(sys.argv)-1):
+        tree = ET.parse(sys.argv[i])
+        print("working with file", i)
+        root = tree.getroot()
+        for word in root.findall('.//phrases/phrase/words/word'):
+            #reset per word so a value from the previous word is never reused
+            surface = None
+            pos = None
+            for item in word.findall('./item'):
+                if item.get("type") == 'txt':
+                    surface = item.text
+                elif item.get("type") == 'pos':
+                    pos = item.text
+            #punctuation and other words without a txt item have no wordform to record
+            if surface:
+                #words without a part-of-speech annotation are filed under "misc"
+                word_by_pos[surface] = pos or "misc"
+    print(len(word_by_pos))
+    writer.writeheader()
+    for wordform, pos in word_by_pos.items():
+        print((wordform, pos))
+        writer.writerow({'Wordform': wordform, 'POS': pos})
diff --git a/lang_data/flextext_wordform_extractor.py b/lang_data/flextext_wordform_extractor.py
index 8de8dc1..3470448 100644
--- a/lang_data/flextext_wordform_extractor.py
+++ b/lang_data/flextext_wordform_extractor.py
@@ -1,59 +1,22 @@
 import sys
 import csv
 import xml.etree.ElementTree as ET
+#This program extracts the surface wordforms in any number of flextext files
+#Run as python3 flextext_wordform_extractor.py input_files[any number] out_file.csv
 
-#Run as python3 flextext_wordform_extractor.py input_file
-tree = ET.parse(sys.argv[1])
-
-root = tree.getroot()
-#with open(sys.argv[2], "w", newline = "") as csvfile:
-#    fieldnames = ['Wordform', 'Morpheme']
-#    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-for form in root.findall('.//phrases/phrase'):
-    for gloss in form.findall(".//*[@type='gls']"):
-        if 'en' in gloss.get("lang"):
-            print("eng_gloss",gloss.text)
-        elif 'es' in gloss.get("lang"):
-            print("es_gloss",gloss.text)
-    for wordform in form.findall('.//word/item'):
-        if 'amu-fonipa' in wordform.get("lang"):
-            #target language wordform
-            print("tl:", wordform.text)
-            for hmm in wordform.findall('.//morphemes/morph/item'):
-                if 'txt' in hmm.get("type"):
-                    print("tl_morph!:", hmm.text)
-    for morpheme in form.findall('.//morphemes/morph/item'):
-        if 'amu-fonipa' in morpheme.get("lang"):
-            if 'txt' in morpheme.get("type"):
-                #target language morpheme
-                print("tl_morph:", morpheme.text)
-        elif 'es' in morpheme.get("lang"):
-            # morpheme in spanish
-            print("es_morph", morpheme.text)
-    # for gloss in form:
-    #     if 'en' in wordform.get("lang"):
-    #         print(gloss.text)
-    #     elif 'es' in wordform.get("lang"):
-    #         print(gloss.text)
-# for morpheme in root.findall('.//morphemes/morph/item'):
-#     if 'amu-fonipa' in morpheme.get("lang") and 'txt' in morpheme.get("type"):
-#         print(morpheme.text)
-# for wordform in root.findall('.//word/item'):
-#     if 'amu-fonipa' in wordform.get("lang"):
-#         print("tl:", wordform.text)
-#for phrase in root.iter("phrase"):
-    # for word_index in phrase.iter("word"):
-        # for child in word_index:
-            # if child.tag == 'morphemes':
-                # print(child.find("item"))
-                # for morpheme in child:
-                    # print(morpheme)
-# for words in root.iter("words"):
-#     for word in words:
-#         print(word.attrib)
-#         items = word.findall('item')
-#         for item in items:
-#             if 'amu-fonipa' in item.get('lang'):
-#                 print(item.text)
-#             elif 'pos' in item.get('type'):
-#                 print(item.text)
\ No newline at end of file
+with open(sys.argv[-1], "w", newline="", encoding="utf-8") as csvfile:
+    fieldnames = ['Wordform', 'POS']
+    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+    #dict keys keep first-seen order and drop repeats across all input files
+    wordforms = {}
+    for i in range(1, len(sys.argv)-1):
+        tree = ET.parse(sys.argv[i])
+        print("working with file", i)
+        root = tree.getroot()
+        #This narrows forms to all those found in each phrase
+        for word in root.findall('.//phrases/phrase/words/word/item'):
+            # extract the target-language wordform; items without a type attribute are skipped
+            if word.get("type") == 'txt':
+                wordforms[word.text] = None
+    for form in wordforms:
+        writer.writerow({'Wordform': form})
\ No newline at end of file