예제 #1
0
        /// <summary>
        /// Loads the XML document at <c>concepts_path</c>, derives a cleaned-up term for every
        /// <c>s</c> element from the value of its first child node, stores that term in the
        /// element's <c>term</c> attribute and saves the document as
        /// <c>concepts_normalized_no_sw_removal.xml</c> in the same directory.
        /// </summary>
        /// <remarks>
        /// As a side effect <c>concepts_path</c> is reassigned to the path of the saved file.
        /// Elements whose first child is missing or has no value are left without a
        /// <c>term</c> attribute instead of aborting the whole run.
        /// </remarks>
        public void generate_terms_for_concepts()
        {
            text_analysis ta_obj = new text_analysis();
            XmlDocument   xmlDoc = new XmlDocument();

            xmlDoc.Load(concepts_path);
            XmlElement  root          = xmlDoc.DocumentElement;
            XmlNodeList synonyms_elem = root.GetElementsByTagName("s");

            foreach (XmlElement synElem in synonyms_elem)
            {
                /*
                 * Things to take care of when normalizing a synonym:
                 * 1. the synonym may itself be a stop word
                 * 2. disambiguation parentheses of the form "(xxxx)"
                 * 3. the form "xxxx, xxxx"
                 * 4. the synonym may contain several parenthesized parts
                 * 5. normalization must not remove stop words
                 * 6. synonyms that are acronyms, e.g. U.S.A
                 * 7. comma-separated enumerations inside a synonym
                 *
                 * Every step below keeps the previous term when the step would
                 * leave nothing behind.
                 */

                // An empty <s/> element has no first child, and a non-text child has a null
                // Value; neither can be normalized, so skip them.
                XmlNode first_child = synElem.FirstChild;
                if (first_child == null || first_child.Value == null)
                {
                    continue;
                }

                string term = first_child.Value;

                // Drop every parenthesized part (non-greedy, so each "(...)" is removed separately).
                string candidate = Regex.Replace(term, @"\(.*?\)", "").Trim();
                term = string.IsNullOrEmpty(candidate) ? term : candidate;

                // Drop everything from the first separator character to the end of the line.
                // NOTE(review): the '¡' in this pattern looks like a mis-encoded Arabic comma
                // (U+060C) - confirm the encoding this file is compiled with.
                candidate = Regex.Replace(term, @"¡.*", "").Trim();
                term      = string.IsNullOrEmpty(candidate) ? term : candidate;

                // NOTE(review): Analysis_type 1 with a 'false' second argument presumably selects
                // normalization without stop-word removal - confirm against text_analysis.
                ta_obj.Analysis_type = 1;
                candidate            = ta_obj.text_processing(term, false).Trim();
                term                 = string.IsNullOrEmpty(candidate) ? term : candidate;

                synElem.SetAttribute("term", term);
            }

            // Path.GetDirectoryName returns null for a root path; fall back to an empty
            // directory so Path.Combine does not throw.
            string directory = Path.GetDirectoryName(concepts_path) ?? string.Empty;

            // The field is deliberately updated so it now refers to the normalized file.
            concepts_path = Path.Combine(directory, "concepts_normalized_no_sw_removal.xml");
            xmlDoc.Save(concepts_path);
        }
예제 #2
0
        /// <summary>
        /// Passes every line of every corpus document through
        /// <c>text_analysis.text_processing</c> and writes the non-empty results, one per line,
        /// to a file in the directory of <c>corpus_path</c>. The opening <c>&lt;DOC ...</c> line
        /// of each document is copied unchanged and a <c>&lt;/DOC&gt;</c> line closes it.
        /// </summary>
        /// <param name="target_name">
        /// Name of the output file. When a file with that name already exists in the corpus
        /// directory, the path returned by <c>generate_new_name</c> is used instead.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// A document does not begin with a line starting with <c>&lt;DOC</c> and end with a
        /// <c>&lt;/DOC&gt;</c> line (this includes null or empty documents).
        /// </exception>
        public void process_corpus(string target_name)
        {
            // Path.GetDirectoryName returns null for a root path; Path.Combine rejects null.
            string corpus_directory = Path.GetDirectoryName(corpus_path) ?? string.Empty;
            string destination_path = Path.Combine(corpus_directory, target_name);

            if (File.Exists(destination_path))
            {
                destination_path = generate_new_name(Path.GetDirectoryName(corpus_path), target_name);
            }

            string[]      sep    = { "\r\n" };
            text_analysis ta_obj = new text_analysis();

            // NOTE(review): the meaning of Analysis_type 3 is defined by text_analysis - confirm there.
            ta_obj.Analysis_type = 3;

            using (StreamWriter sw = new StreamWriter(destination_path, false, corpus_encoding))
            {
                string document;

                if (!get_first_DOC(out document))
                {
                    return;
                }

                do
                {
                    string[] lines = document == null
                        ? new string[0]
                        : document.Split(sep, StringSplitOptions.RemoveEmptyEntries);

                    // A well-formed document has at least an opening and a closing line.
                    bool well_formed = lines.Length >= 2
                                       && lines[0].StartsWith("<DOC")
                                       && lines[lines.Length - 1] == "</DOC>";

                    if (!well_formed)
                    {
                        throw new InvalidDataException(
                            "Malformed corpus document: expected a first line starting with \"<DOC\" and a last line equal to \"</DOC>\".");
                    }

                    sw.WriteLine(lines[0]);

                    // Process only the lines between the opening and closing tags.
                    for (int i = 1; i < lines.Length - 1; i++)
                    {
                        string word = ta_obj.text_processing(lines[i], true);

                        if (!string.IsNullOrEmpty(word))
                        {
                            sw.WriteLine(word);
                        }
                    }

                    sw.WriteLine("</DOC>");
                } while (get_next_DOC(out document));
            }
        }