/// <summary>
/// Reads <paramref name="fileName"/> as UTF-8, one entry per line, and asks
/// <paramref name="ce"/> to retrieve the expressions for each line through
/// <paramref name="searcher"/>.
/// </summary>
/// <param name="ce">Extractor whose <c>retrieveExpression</c> is invoked once per usable line.</param>
/// <param name="searcher">Searcher passed through to <c>retrieveExpression</c>; it is not closed here.</param>
/// <param name="fileName">Path of the text file to read.</param>
/// <returns>
/// A dictionary keyed by the first two space-separated tokens of a line (joined by a
/// single space) whose value is the list returned by <c>retrieveExpression</c> for the
/// whole trimmed line.
/// </returns>
private static Dictionary<string, List<string>> retriveOccurences(ColocationExtractor ce, Lucene.Net.Search.IndexSearcher searcher, string fileName)
{
    Dictionary<string, List<string>> ret = new Dictionary<string, List<string>>();

    // "using" guarantees the reader is released even when the lookup below throws.
    using (StreamReader rdr = new StreamReader(fileName, Encoding.UTF8))
    {
        string line;
        while ((line = rdr.ReadLine()) != null)
        {
            line = line.Trim();
            string[] tokens = line.Split(' ');

            // Blank or single-token lines cannot form the two-token key; skip them
            // instead of failing on tokens[1].
            if (tokens.Length < 2)
            {
                continue;
            }

            string key = tokens[0] + " " + tokens[1];

            // Keep the first entry for a key; a repeated key would otherwise make Add throw.
            if (ret.ContainsKey(key))
            {
                continue;
            }

            // The full trimmed line (not only the two-token key) is what gets searched.
            ret.Add(key, ce.retrieveExpression(searcher, line));
        }
    }

    return ret;
}
/// <summary>
/// Runs the terminology extraction pipeline: loads the language model, gathers word
/// counts from the files listed in <paramref name="inputFile"/>, ranks single word
/// terms against the language model, extracts multi word terms, combines both into
/// the final term list and finally marks the terms in the listed files.
/// </summary>
/// <param name="languageModelFile">
/// Tab-separated file loaded through <c>DataStructReader.readDictionaryD</c>. An entry
/// named <c>_dummy_</c> is used as the reference value for words missing from the model.
/// </param>
/// <param name="inputFile">
/// UTF-8 file with one tab-separated pair per line. The first column is handed to
/// <c>getUserCounts</c> as the file to process; the pair as a whole is handed to
/// <c>markTerms</c>. Only the first pair for a given first column is kept.
/// </param>
/// <param name="keepIntermedyaryFiles">
/// When true, the term list is also written to <c>_terminology</c> and the files
/// <c>_preprocessed</c>, <c>_monoTerms</c> and <c>_multiTerms</c> are left on disk;
/// when false those three files are deleted at the end.
/// </param>
/// <param name="lang">Language code forwarded to <c>getUserCounts</c> and <c>markTerms</c>.</param>
/// <param name="alreadyProcessed">
/// Forwarded to <c>getUserCounts</c> and <c>markTerms</c>; also selects whether the
/// per-file console message says "Reading" or "Processing".
/// </param>
private static void terminology(string languageModelFile, string inputFile, bool keepIntermedyaryFiles, string lang, bool alreadyProcessed)
{
    // NOTE: filtering candidates against a general-terms list is not applied here.

    Dictionary<string, double> ncounts = DataStructReader.readDictionaryD(languageModelFile, Encoding.UTF8, 0, 1, '\t', false, null, null);
    if (ncounts.Count == 0)
    {
        Console.WriteLine("Language Model Missing... Press key for aborting!");
        Console.ReadLine();
        return;
    }

    if (!File.Exists(inputFile))
    {
        Console.WriteLine("Input File doesn't exist... Press key for aborting!");
        Console.ReadLine();
        return;
    }

    // --- Read the list of files to process -------------------------------------
    Dictionary<string, string> fileCorrespondences = new Dictionary<string, string>();
    using (StreamReader rdr = new StreamReader(inputFile, Encoding.UTF8))
    {
        string line;
        while ((line = rdr.ReadLine()) != null)
        {
            string[] parts = line.Trim().Split('\t');

            // Lines without a tab (e.g. a trailing blank line) have no second column;
            // skip them instead of failing on parts[1].
            if (parts.Length < 2)
            {
                continue;
            }

            if (!fileCorrespondences.ContainsKey(parts[0]))
            {
                fileCorrespondences.Add(parts[0], parts[1]);
            }
        }
    }

    // --- Collect counts from every listed file ---------------------------------
    Dictionary<string, double> userCounts = new Dictionary<string, double>();
    Dictionary<string, Dictionary<string, int>> singleOccurencesFirst = new Dictionary<string, Dictionary<string, int>>();
    double total = 0;

    using (StreamWriter wrtProcessed = new StreamWriter("_preprocessed", false, Encoding.UTF8))
    {
        wrtProcessed.AutoFlush = true;
        foreach (string file in fileCorrespondences.Keys.ToArray())
        {
            if (alreadyProcessed)
            {
                Console.Write("\nReading file: {0}", file);
            }
            else
            {
                Console.WriteLine("\nProcessing file: {0}", file);
            }

            getUserCounts(ref userCounts, ref singleOccurencesFirst, file, wrtProcessed, ref total, lang, alreadyProcessed);
        }
    }

    // --- Single word terms -----------------------------------------------------
    Console.Write("Extracting single word terms");

    // Iterate over a snapshot of the keys because the dictionary is modified in the loop.
    foreach (string key in userCounts.Keys.ToArray())
    {
        if (userCounts[key] < 2)
        {
            // Words seen fewer than twice are not considered candidates.
            userCounts.Remove(key);
        }
        else
        {
            // Turn the raw count into a relative frequency.
            userCounts[key] = userCounts[key] / total;
        }
    }

    Dictionary<string, List<string>> singleOccurences = getSingle(singleOccurencesFirst);

    // Score = relative frequency in the input divided by the language model value.
    Dictionary<string, double> results = new Dictionary<string, double>();
    foreach (KeyValuePair<string, double> entry in userCounts)
    {
        double reference;
        if (!ncounts.TryGetValue(entry.Key, out reference))
        {
            // Looked up lazily so a model without "_dummy_" still works when every word is known.
            reference = ncounts["_dummy_"];
        }

        results.Add(entry.Key, entry.Value / reference);
    }

    // Sort ascending by score (keys follow their values) and write in reverse,
    // i.e. highest score first.
    string[] keys = results.Keys.ToArray();
    double[] values = results.Values.ToArray();
    Array.Sort(values, keys);

    using (StreamWriter wrt = new StreamWriter("_monoTerms", false, Encoding.UTF8))
    {
        wrt.AutoFlush = true;
        for (int i = keys.Length - 1; i >= 0; i--)
        {
            wrt.WriteLine("{0}\t{1}", keys[i], values[i]);
        }
    }

    Console.WriteLine(" ... done!");

    // --- Multi word terms ------------------------------------------------------
    Console.Write("Extracting multi word terms");
    ColocationExtractor ce = new ColocationExtractor();
    Dictionary<string, List<string>> multiOccurences = new Dictionary<string, List<string>>();

    if (ce.extractCollocations("_preprocessed", "_multiTerms"))
    {
        Console.WriteLine(" ... done!");

        Console.Write("Create index for extracting exact occurences");
        Directory.CreateDirectory("_index"); // no-op when the directory already exists
        ce.indexText("_preprocessed", "_index");
        Console.WriteLine(" ... done!");

        Console.Write("Search for exact occurences");
        Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher("_index");
        try
        {
            multiOccurences = retriveOccurences(ce, searcher, "_multiTerms");
        }
        finally
        {
            // Release the index even when the lookup fails.
            searcher.Close();
        }

        Console.WriteLine(" ... done!");

        // The index is only needed for the lookup above; remove it with its contents.
        Directory.Delete("_index", true);
    }
    else
    {
        Console.WriteLine(" ... done! - no multi word terms found!");
    }

    // --- Combine, optionally persist, and mark ---------------------------------
    Console.Write("Retrieving terminology");
    List<string> terms = extractTerminology("_monoTerms", "_multiTerms", singleOccurences, multiOccurences);

    if (keepIntermedyaryFiles)
    {
        using (StreamWriter wrtT = new StreamWriter("_terminology", false, Encoding.UTF8))
        {
            wrtT.AutoFlush = true;
            foreach (string term in terms)
            {
                wrtT.WriteLine(term);
            }
        }
    }

    Console.WriteLine(" ... done!");

    HashSet<string> mono = new HashSet<string>();
    Dictionary<string, HashSet<string>> multi = new Dictionary<string, HashSet<string>>();
    HashSet<string> multiOrg = new HashSet<string>();
    getTerms(terms, ref mono, ref multi, ref multiOrg);

    markTerms(lang, fileCorrespondences, mono, multi, multiOrg, alreadyProcessed);

    if (!keepIntermedyaryFiles)
    {
        File.Delete("_preprocessed");
        File.Delete("_monoTerms");
        File.Delete("_multiTerms");
    }
}