commit 2590f36215178d951a335216007a06dc1954d1bd Author: Alexandra Date: Thu Jul 6 20:26:34 2023 -0400 phrase_pulling_update diff --git a/lang_data/flextext_word_by_pos_extractor.py b/lang_data/flextext_word_by_pos_extractor.py index 32352c6..1577094 100644 --- a/lang_data/flextext_word_by_pos_extractor.py +++ b/lang_data/flextext_word_by_pos_extractor.py @@ -2,6 +2,7 @@ import sys import csv import xml.etree.ElementTree as ET word_by_pos = {} +corpus = [] #Run as python3 flextext_wordform_extractor.py input_files[any number] out_file. with open(sys.argv[-1], "w", newline="") as csvfile: fieldnames = ['Wordform', 'POS'] @@ -22,10 +23,16 @@ with open(sys.argv[-1], "w", newline="") as csvfile: word_by_pos[surface] = pos else: word_by_pos[surface] = "misc" - print(len(word_by_pos)) + #print(len(word_by_pos)) writer.writeheader() for word in word_by_pos.items(): - print(word) + #print(word) writer.writerow({'Wordform':word[0], 'POS':word[1]}) - # for form in wordforms: - # writer.writerow({'Wordform': form}) + for phrases in root.findall('.//phrases/phrase'): + phrase = '' + #for word in phrase.findall() + for words in phrases.findall('./words/word/item[@lang={{lang}}]'): + phrase = phrase + words.text + " " + corpus.append(phrase.strip(" ")) + #export to a corpus file + print(corpus)