예제 #1
0
        /// <summary>
        /// Reads <paramref name="fileName"/> line by line (UTF-8) and, for every usable line,
        /// stores the list returned by <c>ce.retrieveExpression</c> for that line.
        /// </summary>
        /// <param name="ce">Extractor whose <c>retrieveExpression</c> is invoked once per stored line.</param>
        /// <param name="searcher">Index searcher passed through unchanged to <c>retrieveExpression</c>.</param>
        /// <param name="fileName">Path of the text file to read; each line is trimmed and split on single spaces.</param>
        /// <returns>
        /// A dictionary keyed by the first two space-separated tokens of a line (joined by one space),
        /// mapped to the result of <c>retrieveExpression</c> for the whole trimmed line.
        /// Lines with fewer than two tokens are skipped; when a key repeats, the first entry is kept.
        /// </returns>
        private static Dictionary <string, List <string> > retriveOccurences(ColocationExtractor ce, Lucene.Net.Search.IndexSearcher searcher, string fileName)
        {
            Dictionary <string, List <string> > ret = new Dictionary <string, List <string> >();

            // The using block guarantees the reader is released even if a lookup throws.
            using (StreamReader rdr = new StreamReader(fileName, Encoding.UTF8))
            {
                string line;
                while ((line = rdr.ReadLine()) != null)
                {
                    line = line.Trim();
                    string[] tokens = line.Split(' ');

                    // Blank or single-token lines cannot form a two-token key.
                    if (tokens.Length < 2)
                    {
                        continue;
                    }

                    string key = tokens[0] + " " + tokens[1];

                    // Dictionary.Add would throw on a repeated key; keep the first occurrence.
                    if (ret.ContainsKey(key))
                    {
                        continue;
                    }

                    ret.Add(key, ce.retrieveExpression(searcher, line));
                }
            }

            return ret;
        }
예제 #2
0
        /// <summary>
        /// Runs the terminology pipeline: collects word counts from the files listed in
        /// <paramref name="inputFile"/>, scores single words against the counts loaded from
        /// <paramref name="languageModelFile"/>, extracts collocations, builds the term list and
        /// finally calls <c>markTerms</c> on the listed files.
        /// </summary>
        /// <param name="languageModelFile">
        /// Tab-separated file loaded through <c>DataStructReader.readDictionaryD</c>. The entry
        /// <c>_dummy_</c> is used as the divisor for words that have no entry of their own.
        /// </param>
        /// <param name="inputFile">
        /// UTF-8 file with one tab-separated pair per line; the first column is passed to
        /// <c>getUserCounts</c> as the file to read, and the whole map is passed to <c>markTerms</c>.
        /// Lines without a second column are ignored; for a repeated first column the first pair wins.
        /// </param>
        /// <param name="keepIntermedyaryFiles">
        /// When true, the term list is also written to <c>_terminology</c> and the working files are kept;
        /// when false, <c>_preprocessed</c>, <c>_monoTerms</c> and <c>_multiTerms</c> are deleted at the end.
        /// </param>
        /// <param name="lang">Language code forwarded to <c>getUserCounts</c> and <c>markTerms</c>.</param>
        /// <param name="alreadyProcessed">
        /// Forwarded to <c>getUserCounts</c> and <c>markTerms</c>; also selects the "Reading" versus
        /// "Processing" progress message.
        /// </param>
        private static void terminology(string languageModelFile, string inputFile, bool keepIntermedyaryFiles, string lang, bool alreadyProcessed)
        {
            // NOTE: filtering by a general-terms list (gtRo.txt / gtEn.txt) is currently disabled.
            Dictionary <string, double> ncounts = DataStructReader.readDictionaryD(languageModelFile, Encoding.UTF8, 0, 1, '\t', false, null, null);

            if (ncounts.Count == 0)
            {
                Console.WriteLine("Language Model Missing... Press key for aborting!");
                Console.ReadLine();
                return;
            }

            if (!File.Exists(inputFile))
            {
                Console.WriteLine("Input File doesn't exist... Press key for aborting!");
                Console.ReadLine();
                return;
            }

            // Load the tab-separated pairs; the reader is disposed as soon as the file is consumed.
            Dictionary <string, string> fileCorrespondences = new Dictionary <string, string>();
            using (StreamReader rdr = new StreamReader(inputFile, Encoding.UTF8))
            {
                string line;
                while ((line = rdr.ReadLine()) != null)
                {
                    string[] parts = line.Trim().Split('\t');

                    // Blank or malformed lines have no second column; skip them instead of crashing.
                    if (parts.Length < 2)
                    {
                        continue;
                    }

                    if (!fileCorrespondences.ContainsKey(parts[0]))
                    {
                        fileCorrespondences.Add(parts[0], parts[1]);
                    }
                }
            }

            Dictionary <string, double> userCounts = new Dictionary <string, double>();
            double total = 0;
            Dictionary <string, Dictionary <string, int> > singleOccurencesFirst = new Dictionary <string, Dictionary <string, int> >();

            // Accumulate counts over every listed file; getUserCounts receives the shared writer.
            string[] files = fileCorrespondences.Keys.ToArray();
            using (StreamWriter wrtProcessed = new StreamWriter("_preprocessed", false, Encoding.UTF8))
            {
                wrtProcessed.AutoFlush = true;

                foreach (string file in files)
                {
                    if (alreadyProcessed)
                    {
                        Console.Write("\nReading file: {0}", file);
                    }
                    else
                    {
                        Console.WriteLine("\nProcessing file: {0}", file);
                    }
                    getUserCounts(ref userCounts, ref singleOccurencesFirst, file, wrtProcessed, ref total, lang, alreadyProcessed);
                }
            }

            Console.Write("Extracting single word terms");

            // Drop words counted fewer than two times and turn the remaining counts into relative frequencies.
            // Keys are snapshotted with ToArray because the dictionary is modified inside the loop.
            foreach (string key in userCounts.Keys.ToArray())
            {
                double count = userCounts[key];
                if (count < 2)
                {
                    userCounts.Remove(key);
                }
                else
                {
                    userCounts[key] = count / total;
                }
            }

            Dictionary <string, List <string> > singleOccurences = getSingle(singleOccurencesFirst);

            // Score = user frequency divided by the language-model value for the word,
            // falling back to the "_dummy_" entry for words the model does not contain.
            // NOTE(review): a model without "_dummy_" still throws KeyNotFoundException here, as before.
            Dictionary <string, double> results = new Dictionary <string, double>();
            foreach (string word in userCounts.Keys)
            {
                double reference;
                if (!ncounts.TryGetValue(word, out reference))
                {
                    reference = ncounts["_dummy_"];
                }

                results.Add(word, userCounts[word] / reference);
            }

            string[] keys   = results.Keys.ToArray();
            double[] values = results.Values.ToArray();

            // Sorts ascending by score (keys follow); the loop below writes from the end for descending order.
            Array.Sort(values, keys);

            using (StreamWriter wrt = new StreamWriter("_monoTerms", false, Encoding.UTF8))
            {
                wrt.AutoFlush = true;
                for (int i = keys.Length - 1; i >= 0; i--)
                {
                    wrt.WriteLine("{0}\t{1}", keys[i], values[i]);
                }
            }

            Console.WriteLine(" ... done!");

            Console.Write("Extracting multi word terms");

            ColocationExtractor ce = new ColocationExtractor();
            Dictionary <string, List <string> > multiOccurences = new Dictionary <string, List <string> >();

            if (ce.extractCollocations("_preprocessed", "_multiTerms"))
            {
                Console.WriteLine(" ... done!");

                Console.Write("Create index for extracting exact occurences");

                if (!Directory.Exists("_index"))
                {
                    Directory.CreateDirectory("_index");
                }
                ce.indexText("_preprocessed", "_index");
                Console.WriteLine(" ... done!");

                Console.Write("Search for exact occurences");
                Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher("_index");
                try
                {
                    multiOccurences = retriveOccurences(ce, searcher, "_multiTerms");
                }
                finally
                {
                    // Close the searcher even when the lookup fails, so the index files can be removed.
                    searcher.Close();
                }
                Console.WriteLine(" ... done!");

                // The index is only a temporary aid: remove its files, then the directory itself.
                foreach (string f in Directory.GetFiles("_index"))
                {
                    File.Delete(f);
                }
                Directory.Delete("_index");
            }
            else
            {
                Console.WriteLine(" ... done! - no multi word terms found!");
            }

            Console.Write("Retrieving terminology");

            List <string> terms = extractTerminology("_monoTerms", "_multiTerms", singleOccurences, multiOccurences);

            if (keepIntermedyaryFiles)
            {
                using (StreamWriter wrtT = new StreamWriter("_terminology", false, Encoding.UTF8))
                {
                    wrtT.AutoFlush = true;
                    foreach (string term in terms)
                    {
                        wrtT.WriteLine(term);
                    }
                }
            }

            Console.WriteLine(" ... done!");

            HashSet <string> mono = new HashSet <string>();
            Dictionary <string, HashSet <string> > multi = new Dictionary <string, HashSet <string> >();
            HashSet <string> multiOrg = new HashSet <string>();

            getTerms(terms, ref mono, ref multi, ref multiOrg);
            markTerms(lang, fileCorrespondences, mono, multi, multiOrg, alreadyProcessed);

            if (!keepIntermedyaryFiles)
            {
                // File.Delete does not throw when the file is absent (e.g. no "_multiTerms" was produced).
                File.Delete("_preprocessed");
                File.Delete("_monoTerms");
                File.Delete("_multiTerms");
            }
        }