/// <summary>
/// Loads the concepts XML file at <c>concepts_path</c>, normalizes the text of every
/// &lt;s&gt; (synonym) element, stores the result in a "term" attribute on the element,
/// and saves the document to "concepts_normalized_no_sw_removal.xml" in the same
/// directory (updating <c>concepts_path</c> to point at the new file).
/// </summary>
/// <remarks>
/// Normalization steps, in order (each step is kept only if it leaves a non-empty string):
///  1. Strip disambiguation parentheses, e.g. "xxxx (yyyy)" -> "xxxx"; a synonym may
///     contain several parenthesized spans.
///  2. Truncate at the first Arabic comma, keeping only the first alternative of a
///     comma-separated list. (The literal in the regex is the Arabic comma as it appears
///     in this file's encoding — do not re-encode it.)
///  3. Run text_analysis normalization (Analysis_type = 1: normalize WITHOUT stopword
///     removal, so stopword-only synonyms and acronyms like "U.S.A" survive).
/// NOTE(review): the original code special-cased one specific synonym with an empty
/// debug breakpoint block; that dead no-op has been removed.
/// </remarks>
public void generate_terms_for_concepts()
{
    text_analysis ta_obj = new text_analysis();

    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.Load(concepts_path);
    XmlElement root = xmlDoc.DocumentElement;
    XmlNodeList synonyms_elem = root.GetElementsByTagName("s");

    string term, temp_term;
    foreach (XmlElement synElem in synonyms_elem)
    {
        term = synElem.ChildNodes[0].Value;

        // 1. Remove all "(...)" disambiguation spans (non-greedy, so multiple
        //    parenthesized groups are each removed independently).
        temp_term = Regex.Replace(term, @"\(.*?\)", "").Trim();
        if (!string.IsNullOrEmpty(temp_term)) { term = temp_term; }

        // 2. Keep only the text before the first Arabic comma.
        temp_term = Regex.Replace(term, @"¡.*", "").Trim();
        if (!string.IsNullOrEmpty(temp_term)) { term = temp_term; }

        // 3. Linguistic normalization without stopword removal.
        ta_obj.Analysis_type = 1;
        temp_term = ta_obj.text_processing(term, false).Trim();
        if (!string.IsNullOrEmpty(temp_term)) { term = temp_term; }

        synElem.SetAttribute("term", term);
    }

    // Save next to the input file and point concepts_path at the normalized output.
    concepts_path = Path.Combine(Path.GetDirectoryName(concepts_path),
                                 "concepts_normalized_no_sw_removal.xml");
    xmlDoc.Save(concepts_path);
}
/// <summary>
/// Streams every document out of the corpus at <c>corpus_path</c>, runs each
/// line/token through text_analysis (Analysis_type = 3) and writes the processed
/// corpus to <paramref name="target_name"/> in the corpus directory, preserving
/// the &lt;DOC ...&gt; / &lt;/DOC&gt; framing. Empty processing results are dropped.
/// </summary>
/// <param name="target_name">File name for the processed corpus; if it already
/// exists a fresh name is generated via <c>generate_new_name</c>.</param>
/// <exception cref="InvalidDataException">A document is not wrapped in a
/// &lt;DOC ...&gt; ... &lt;/DOC&gt; pair (or is empty).</exception>
public void process_corpus(string target_name)
{
    string destination_path = Path.GetDirectoryName(corpus_path) + "\\" + target_name;
    if (File.Exists(destination_path))
    {
        // Never clobber an existing output file.
        destination_path = generate_new_name(Path.GetDirectoryName(corpus_path), target_name);
    }

    string[] sep = { "\r\n" };
    text_analysis ta_obj = new text_analysis();
    ta_obj.Analysis_type = 3;

    using (StreamWriter sw = new StreamWriter(destination_path, false, corpus_encoding))
    {
        string document;
        string word;
        if (get_first_DOC(out document))
        {
            do
            {
                List<string> tokens = new List<string>(
                    document.Split(sep, StringSplitOptions.RemoveEmptyEntries));

                // Guard the indexers: an empty document is malformed too, and must
                // hit the explicit error below rather than an index-out-of-range.
                if (tokens.Count >= 2
                    && tokens[0].StartsWith("<DOC")
                    && tokens[tokens.Count - 1] == "</DOC>")
                {
                    sw.WriteLine(tokens[0]);
                    tokens.RemoveAt(tokens.Count - 1);
                    tokens.RemoveAt(0);

                    foreach (string token in tokens)
                    {
                        word = ta_obj.text_processing(token, true);
                        if (!string.IsNullOrEmpty(word))
                        {
                            sw.WriteLine(word);
                        }
                    }

                    sw.WriteLine("</DOC>");
                }
                else
                {
                    throw new InvalidDataException(
                        "Corpus document is not framed by <DOC ...> and </DOC> markers.");
                }
            } while (get_next_DOC(out document));
        }
    }
}