commit 985b008c248999733436f762f388e196a58adef5 Author: Tanmai Khanna Date: Tue Jun 18 01:42:12 2019 +0530 Anaphora module coded in C++ (slight bug to fix) diff --git a/src/anaphora.cc b/src/anaphora.cc index 96439c1..d42edf3 100644 --- a/src/anaphora.cc +++ b/src/anaphora.cc @@ -1,43 +1,134 @@ +#include "parse_biltrans.h" + #include #include #include +#include -int main() +using namespace std; + +void print(vector const &input) { - char input_char; + for (int i = 0; i < input.size(); i++) + { + fprintf(stdout, "%c", input.at(i)); + } +} - input_char = fgetc(stdin); +int contains(vector< vector > tags, vector tag) +{ + if(std::find(tags.begin(), tags.end(), tag) != tags.end()) + return 1; + else + return 0; +} - char input_stream[100] = ""; - char output_stream[100] = ""; +vector string_to_vector(char *string_in) +{ + vector temp; - int flag_LU = 0; + for(int i = 0; i < strlen(string_in); i++) + temp.push_back(string_in[i]); - while(input_char!=EOF) - { - fprintf(stdout, "%c",input_char); + return temp; +} - if(input_char == '^') - flag_LU = 1; +int main() +{ + char input_char; - if(flag_LU == 1) //Part of an LU - strcat(input_stream, string(input_char)) + input_char = fgetc(stdin); - if(flag_LU == 0) //Not Part of an LU - fprintf(stdout, "%c", input_char); + vector input_stream; + vector last_noun; - if(input_char == '$') - { - flag_LU = 0; + vector temp_form; + vector< vector > temp_tags; - LU = parse(input_stream); + vector antecedent_tag = string_to_vector("n"); - } + vector r1_tag1 = string_to_vector("det"); + vector r1_tag2 = string_to_vector("pos"); + vector r2_tag1 = string_to_vector("prn"); + int flag_LU = 0; + while(input_char!=EOF) + { - input_char = fgetc(stdin); + if(input_char == '\\') //dealing with escaped characters + { + if(flag_LU == 0) // not inside LU + { + fprintf(stdout, "%c", input_char); + + input_char = fgetc(stdin); + + fprintf(stdout, "%c", input_char); + } + else //inside LU + { + input_stream.push_back(input_char); + fprintf(stdout, 
"%c", input_char); + + input_char = fgetc(stdin); + + fprintf(stdout, "%c", input_char); + input_stream.push_back(input_char); + } + } + else + { + if(flag_LU == 0) //Not Part of an LU + { + fprintf(stdout, "%c", input_char); + + if(input_char == '^') + flag_LU = 1; + } + + else if(flag_LU == 1) //Part of an LU + { + if(input_char == '$') + { + fprintf(stdout, "/"); //for extra LU + + flag_LU = 0; + LexicalUnit LU(input_stream); //Parse Lexical Unit using parse_biltrans + + temp_form = LU.get_tl_form(); + temp_tags = LU.get_tl_tags(); + + if(!temp_form.empty()) //if TL exists + { + if(contains(temp_tags, antecedent_tag)) + /* if TL contains antecedent tag */ + { + last_noun = temp_form; + } + + if((contains(temp_tags, r1_tag1) && contains(temp_tags, r1_tag2)) || contains(temp_tags, r2_tag1)) + /* if TL tags has det and pos OR just prn*/ + { + print(last_noun); //add last seen noun to LU + } + } + + input_stream.clear(); + + } + else + { + input_stream.push_back(input_char); + } + + fprintf(stdout, "%c", input_char); + + } + + input_char = fgetc(stdin); + } } //fclose(fin); diff --git a/src/parse_biltrans.cc b/src/parse_biltrans.cc index 07ab1fe..5b77d3d 100644 --- a/src/parse_biltrans.cc +++ b/src/parse_biltrans.cc @@ -1,140 +1,129 @@ +#include "parse_biltrans.h" + #include #include using namespace std; -class LexicalUnit +LexicalUnit::LexicalUnit(vector input_LU) { + int seenSlash = 0; + int seenTag = 0; -private: - - vector sl_form; - vector tl_form; - - vector< vector > sl_tags; - vector< vector > tl_tags; + vector temptag; -public: - - LexicalUnit(vector input_LU) + for (auto i = input_LU.begin(); i != input_LU.end(); ++i) { - int seenSlash = 0; - int seenTag = 0; - - vector temptag; - - for (auto i = input_LU.begin(); i != input_LU.end(); ++i) - { - if(*i == '\\') //dealing with escaped characters + if(*i == '\\') //dealing with escaped characters + { + if(seenSlash == 0) //sl { - if(seenSlash == 0) //sl - { - if(seenTag == 1) //in a tag - { - 
temptag.push_back(*i); - sl_form.push_back(*i); - ++i; - temptag.push_back(*i); - sl_form.push_back(*i); - } - else //not in a tag - { - sl_form.push_back(*i); - ++i; - sl_form.push_back(*i); - } + if(seenTag == 1) //in a tag + { + temptag.push_back(*i); + sl_form.push_back(*i); + ++i; + temptag.push_back(*i); + sl_form.push_back(*i); + } + else //not in a tag + { + sl_form.push_back(*i); + ++i; + sl_form.push_back(*i); + } + } + else //tl + { + if(seenTag == 1) //in a tag + { + temptag.push_back(*i); + tl_form.push_back(*i); + ++i; + temptag.push_back(*i); + tl_form.push_back(*i); } - else //tl + else //not in a tag { - if(seenTag == 1) //in a tag - { - temptag.push_back(*i); - tl_form.push_back(*i); - ++i; - temptag.push_back(*i); - tl_form.push_back(*i); - } - else //not in a tag - { - tl_form.push_back(*i); - ++i; - tl_form.push_back(*i); - } + tl_form.push_back(*i); + ++i; + tl_form.push_back(*i); } } + } - else if(*i == '/') - seenSlash++; + else if(*i == '/') + seenSlash++; - else if(seenSlash == 0) //sl - { - sl_form.push_back(*i); //add to the sl form + else if(seenSlash == 0) //sl + { + sl_form.push_back(*i); //add to the sl form - if(*i == '<') //start reading tag - seenTag++; + if(*i == '<') //start reading tag + seenTag++; - else if(seenTag == 1) //inside a tag + else if(seenTag == 1) //inside a tag + { + if(*i == '>') //if tag ends + { + seenTag--; + sl_tags.push_back(temptag); //add tag to list of sl tags + + temptag.clear(); + } + else { - if(*i == '>') //if tag ends - { - seenTag--; - sl_tags.push_back(temptag); //add tag to list of sl tags - - temptag.clear(); - } - else - { - temptag.push_back(*i); //add char to current tag - } + temptag.push_back(*i); //add char to current tag } } + } - else //tl - { - tl_form.push_back(*i); //add to the tl form + else //tl + { + tl_form.push_back(*i); //add to the tl form + + if(*i == '<') //start reading tag + seenTag++; - if(*i == '<') //start reading tag - seenTag++; + else if(seenTag == 1) //inside a tag + 
{ + if(*i == '>') //if tag ends + { + seenTag--; + tl_tags.push_back(temptag); //add tag to list of tl tags - else if(seenTag == 1) //inside a tag + temptag.clear(); + } + else { - if(*i == '>') //if tag ends - { - seenTag--; - tl_tags.push_back(temptag); //add tag to list of tl tags - - temptag.clear(); - } - else - { - temptag.push_back(*i); //add char to current tag - } + temptag.push_back(*i); //add char to current tag } } } } +} - vector get_sl_form() - { - return sl_form; - } +vector LexicalUnit::get_sl_form() +{ + return sl_form; +} - vector get_tl_form() - { - return tl_form; - } +vector LexicalUnit::get_tl_form() +{ + return tl_form; +} - vector< vector > get_sl_tags() - { - return sl_tags; - } +vector< vector > LexicalUnit::get_sl_tags() +{ + return sl_tags; +} - vector< vector > get_tl_tags() - { - return tl_tags; - } +vector< vector > LexicalUnit::get_tl_tags() +{ + return tl_tags; +} -}; +/* Uncomment to test this code void print(vector const &input) { @@ -182,5 +171,7 @@ int main() return 0; } +*/ + diff --git a/src/parse_biltrans.h b/src/parse_biltrans.h new file mode 100644 index 0000000..87351fd --- /dev/null +++ b/src/parse_biltrans.h @@ -0,0 +1,64 @@ +#ifndef _PARSEBILTRANS_ +#define _PARSEBILTRANS_ + +#include + +using namespace std; + +/** + * Parsing Lexical Unit from biltrans for the Anaphora Module + */ +class LexicalUnit +{ + +private: + /** + * Source language word and tags + */ + vector sl_form; + + /** + * Target language word and tags + */ + vector tl_form; + + /** + * Source language tags + */ + vector< vector > sl_tags; + + /** + * Target language tags + */ + vector< vector > tl_tags; + +public: + /** + * Constructor to fill all variables + * @param input_LU one lexical unit between ^ and $ (excluded) + */ + LexicalUnit(vector input_LU); + + /** + * Return the Source Language Form + */ + vector get_sl_form(); + + /** + * Return the Target Language Form + */ + vector get_tl_form(); + + /** + * Return the Source Language Tags + */ + 
vector< vector > get_sl_tags(); + + /** + * Return the Target Language Tags + */ + vector< vector > get_tl_tags(); + +}; + +#endif \ No newline at end of file