commit 9ba00c8982bfb1e9060190badc8059dbf7b5c5af
Author: Alexandra <aconeil@iu.edu>
Date:   Wed Jul 12 17:41:44 2023 -0400

    corpus_and_lexd_file_generation

diff --git a/lang_data/amu-fonipa.lexd b/lang_data/amu-fonipa.lexd
new file mode 100644
index 0000000..20c81e4
--- /dev/null
+++ b/lang_data/amu-fonipa.lexd
@@ -0,0 +1,120 @@
+PATTERNS
+vproc
+sust
+conj
+part
+prep
+adv
+vest
+adj
+pron
+misc
+spos
+vtr
+vintr
+
+LEXICON vproc
+<vproc>:kwinckwii
+
+
+LEXICON sust
+<sust>:nnʔã
+<sust>:tmaãʔ
+<sust>:ɲõndaa
+<sust>:ɲʔoõ
+<sust>:ndɛɛ
+<sust>:ti
+<sust>:ɲʔoõ-ja
+<sust>:ndɛ
+<sust>:hndɛ
+
+
+LEXICON conj
+<conj>:kancha
+<conj>:sɛɛ
+<conj>:hoʔ
+<conj>:ndoʔ
+<conj>:meiĩ
+
+
+LEXICON part
+<part>:na
+
+
+LEXICON prep
+<prep>:nakiiʔ
+
+
+LEXICON adv
+<adv>:lhooɲe
+<adv>:maheʔnɟo
+<adv>:ɲhaã
+<adv>:juu
+<adv>:tʃaʔʃhẽ
+<adv>:lhoo
+<adv>:maʃhẽ
+<adv>:ɲhõ
+
+
+LEXICON vest
+<vest>:hnɟu
+<vest>:mʔaã
+<vest>:matʃʔeenaʔ
+<vest>:nckwiiʔ
+
+
+LEXICON adj
+<adj>:kwii
+<adj>:hnde
+
+
+LEXICON pron
+<pron>:ha
+<pron>:hoonaʔ
+<pron>:hoona
+
+
+LEXICON misc
+<misc>:hnɟu-ja
+<misc>:Jesus
+<misc>:Santana
+<misc>:mʔaã-ja
+<misc>:grabacion
+<misc>:programa
+<misc>:Alianza
+<misc>:de
+<misc>:Lenguas
+<misc>:en
+<misc>:Peligro
+<misc>:proyekto
+<misc>:waa-ɲe
+<misc>:makiaʔ
+<misc>:luu
+<misc>:lʔuu
+<misc>:mʔa
+<misc>:nda-ja
+<misc>:ʃo
+<misc>:waa-ne
+<misc>:ne
+<misc>:wa
+<misc>:nggwana
+<misc>:asta
+
+
+LEXICON spos
+<spos>:lote-ja
+<spos>:tsʔiaãʔ
+<spos>:nda
+
+
+LEXICON vtr
+<vtr>:kwilʔuee-na
+<vtr>:kwilaʔneĩ
+<vtr>:laʔneĩ-na
+<vtr>:kwilaʔthõ-nɟɔ
+<vtr>:laʔlheii-na
+
+
+LEXICON vintr
+<vintr>:kwilʔa-ja
+
diff --git a/lang_data/corpus.txt b/lang_data/corpus.txt
new file mode 100644
index 0000000..98ac979
--- /dev/null
+++ b/lang_data/corpus.txt
@@ -0,0 +1,22 @@
+ha hnɟu-ja Jesus Santana
+mʔaã-ja lhooɲe na maheʔnɟo
+kwilʔa-ja grabacion ɲhaã nakiiʔ
+programa na hnɟu Alianza
+de Lenguas en Peligro
+kwii proyekto
+juu na nnʔã na mʔaã nakiiʔ tmaãʔ waa-ɲe
+na kwilʔuee-na
+nnʔã na kwilaʔneĩ ɲõndaa
+makiaʔ ɲʔoõ luu
+kancha na na hnde nnʔã kwilaʔneĩ
+ɲʔoõ lʔuu tʃaʔʃhẽ na lote-ja lʔuu
+hoonaʔ ndɛɛ laʔneĩ-na sɛɛ ha mʔa nda-ja
+ti ʃo ndɛɛ laʔneĩ-na hoʔ na matʃʔeenaʔ na
+kwinckwii lʔuu ɲʔoõ-ja
+ndoʔ hoʔ tsʔiaãʔ tmaãʔ waa-ne
+na mʔaã na lhoo ne
+kwilaʔthõ-nɟɔ lhoo ne
+na nda-ja
+ndoʔ nda hoona na ti nckwiiʔ ɲʔoõ wa maʃhẽ nggwana lʔuu na ndɛ laʔneĩ-na asta hndɛ laʔlheii-na
+ɲhõ meiĩ hoʔ na
+kwilʔa-ja grabacion na maheʔnɟo
diff --git a/lang_data/flextext_word_by_pos_extractor.py b/lang_data/flextext_word_by_pos_extractor.py
index 1577094..06b2898 100644
--- a/lang_data/flextext_word_by_pos_extractor.py
+++ b/lang_data/flextext_word_by_pos_extractor.py
@@ -1,38 +1,46 @@
 import sys
-import csv
 import xml.etree.ElementTree as ET
 word_by_pos = {}
-corpus = []
-#Run as python3 flextext_wordform_extractor.py input_files[any number] out_file.
-with open(sys.argv[-1], "w", newline="") as csvfile:
-    fieldnames = ['Wordform', 'POS']
-    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-    for i in range(1, len(sys.argv)-1):
-        tree = ET.parse(sys.argv[i])
-        print("working with file", i)
-        root = tree.getroot()
-        #save information about which language for future use
-        lang = [x.get("lang") for x in root.findall('.//languages/language/[@font="Charis SIL"]')][0]
-        for word in root.findall('.//phrases/phrase/words/word'):
-            for item in word.findall('./item'):
-                if item.get("type") == 'txt':
-                    surface = item.text
-                if word.findall('./item[@type="pos"]') != []:
-                    if item.get("type") == 'pos':
-                        pos = item.text
-                        word_by_pos[surface] = pos
-                else:
-                    word_by_pos[surface] = "misc"
-    #print(len(word_by_pos))
-    writer.writeheader()
-    for word in word_by_pos.items():
-        #print(word)
-        writer.writerow({'Wordform':word[0], 'POS':word[1]})
-    for phrases in root.findall('.//phrases/phrase'):
-        phrase = ''
-        #for word in phrase.findall()
-        for words in phrases.findall('./words/word/item[@lang={{lang}}]'):
-            phrase = phrase + words.text + " "
-        corpus.append(phrase.strip(" "))
-    #export to a corpus file
-    print(corpus)
+#Run as: python3 flextext_word_by_pos_extractor.py input_files[any number]
+for i in range(1, len(sys.argv)):
+    tree = ET.parse(sys.argv[i])
+    print("working with file", i)
+    root = tree.getroot()
+    #save the language code of the "Charis SIL" language entry; it names the output files and selects the wordform items
+    lang = [x.get("lang") for x in root.findall('.//languages/language/[@font="Charis SIL"]')][0]
+    lexd = open('%s.lexd' % lang, 'w')
+    morphlexd = open('morph_%s.lexd' % lang, 'w')
+    corpus = open('%s_corpus.txt' % lang, "w")
+    #Loop through the words in each phrase
+    for word in root.findall('.//phrases/phrase/words/word'):
+        for item in word.findall('./item'):
+            #for wordform in target language, get text
+            if item.get("type") == 'txt' and item.get("lang") == lang:
+                surface = item.text
+            #extract any available pos information about the wordform
+            if word.findall('./item[@type="pos"]') != []:
+                if item.get("type") == 'pos':
+                    pos = item.text
+                    word_by_pos[surface] = pos
+            #or else assign miscellaneous POS
+            else:
+                word_by_pos[surface] = "misc"
+pos_tags = list(set(word_by_pos.values()))
+#for now each PATTERNS entry is just a part-of-speech tag; real patterns come in later steps of the project
+lexd.write("PATTERNS\n")
+[lexd.write(pos_tags[x].replace(".", "").replace(" ", "") + "\n") for x in range(len(pos_tags))]
+for pos_tag in pos_tags:
+    lexd.write("\nLEXICON " + pos_tag.replace(".", "").replace(" ", "") + "\n")
+    for entry in word_by_pos.items():
+        if pos_tag in entry:
+            lexd.write("<" + pos_tag.replace(".", "").replace(" ", "") + ">:" + entry[0] + "\n")
+    lexd.write("\n")
+#write every phrase to the corpus file, one per line (NOTE: root and lang here come from the last input file only)
+for phrases in root.findall('.//phrases/phrase'):
+    iso_path = './words/word/item/[@lang="' + lang + '"]'
+    phrase = ''
+    #for word in phrase.findall()
+    for words in phrases.findall(iso_path):
+        phrase = phrase + words.text + " "
+    corpus.write(phrase.strip(" ") +"\n")
+corpus.close()