// Scans a file for MUC-style named-entity tags (ENAMEX/TIMEX/NUMEX-like:
// <...MEX TYPE="...">text</...MEX>) and appends one Entity per match to `nes`.
// When additionalAnnotation is set, inline annotation is stripped from the surface form.
private static void readNamedEntities(string file, ref List<Entity> nes, bool additionalAnnotation)
{
    string content = DataStructReader.readWholeTextFile(file, Encoding.UTF8);
    Regex tagPattern = new Regex("<.+?MEX TYPE=\"(?<type>.+?)\">(?<occ>.+?)</.+?MEX>", RegexOptions.Singleline);
    foreach (Match match in tagPattern.Matches(content))
    {
        Entity entity = new Entity();
        entity.type = match.Groups["type"].Value;
        string surface = match.Groups["occ"].Value;
        entity.occurence = additionalAnnotation ? stripAnnotation(surface) : surface;
        nes.Add(entity);
    }
}
// Accumulates word counts (and per-segment single-occurrence data) for one input file.
// Raw files are preprocessed line by line (with a console progress indicator); files
// that are already preprocessed are scanned for <seg lang="..."> elements instead.
// Preprocessed text is echoed through `wrt` as a side effect of process().
private static void getUserCounts(ref Dictionary<string, double> counts, ref Dictionary<string, Dictionary<string, int>> singleOccurences, string fileName, StreamWriter wrt, ref double total, string lang, bool alreadyProcessed)
{
    string text = DataStructReader.readWholeTextFile(fileName, Encoding.UTF8);
    if (alreadyProcessed)
    {
        // Already tokenized: pull out every <seg> element for this language.
        Regex segPattern = new Regex("<seg lang=\"" + lang + "\">.+?</seg>", RegexOptions.Singleline);
        for (Match match = segPattern.Match(text); match.Success; match = match.NextMatch())
        {
            process(match.Value, ref counts, ref singleOccurences, ref total, wrt);
        }
    }
    else
    {
        // Raw text: preprocess each physical line, then process every resulting segment.
        string[] lines = text.Split('\n');
        int done = 0;
        foreach (string rawLine in lines)
        {
            string[] segments = preprocess(rawLine, lang).Trim().Split('\n');
            foreach (string segment in segments)
            {
                process(segment, ref counts, ref singleOccurences, ref total, wrt);
            }
            done++;
            Console.Write("\r{0}% ", 100 * done / lines.Length);
        }
    }
}
// Reads terminology entries (contents of <TENAME>...</TENAME> tags) from a file,
// lower-cases them, optionally strips inline annotation, and appends each distinct
// term to `nes` in first-seen order.
private static void readTerminology(string file, ref List<string> nes, bool additionalAnnotation)
{
    string text = DataStructReader.readWholeTextFile(file, Encoding.UTF8);
    // HashSet replaces the original Dictionary<string, bool> used purely as a set:
    // same de-duplication semantics, one lookup per candidate instead of
    // ContainsKey followed by Add.
    HashSet<string> unique = new HashSet<string>();
    Regex regex = new Regex("<TENAME>(?<occ>.+?)</TENAME>", RegexOptions.Singleline);
    Match m = regex.Match(text);
    while (m.Success)
    {
        string occ = m.Groups["occ"].Value.ToLower();
        if (additionalAnnotation)
        {
            occ = stripAnnotation(occ);
        }
        // Add returns false for duplicates, so each term is appended only once.
        if (unique.Add(occ))
        {
            nes.Add(occ);
        }
        m = m.NextMatch();
    }
}
// Full terminology-extraction pipeline:
//   1. load background language-model frequencies (ncounts; "_dummy_" is the
//      fallback key for unseen words),
//   2. build user-corpus counts from the files listed in inputFile (each line is
//      "sourceFile<TAB>outputFile"),
//   3. score single-word terms (user frequency / background frequency) into _monoTerms,
//   4. extract multi-word collocations via ColocationExtractor and a temporary
//      Lucene index, retrieving their exact occurrences,
//   5. merge both into a terminology list and annotate the source files (markTerms).
// Intermediate files (_preprocessed, _monoTerms, _multiTerms, _terminology) are
// removed unless keepIntermedyaryFiles is true.
// FIX: the StreamReader over inputFile was never closed (file-handle leak);
// it is now closed after the correspondence table is read.
private static void terminology(string languageModelFile, string inputFile, bool keepIntermedyaryFiles, string lang, bool alreadyProcessed)
{
    List<string> terms = new List<string>();
    //HashSet<string> generalTerms = new HashSet<string>();
    //if (lang == "ro")
    //{
    //    generalTerms = DataStructReader.readHashSet("gtRo.txt", Encoding.UTF8, 0, '\t', true, null);
    //}
    //else if (lang == "en")
    //{
    //    generalTerms = DataStructReader.readHashSet("gtEn.txt", Encoding.UTF8, 0, '\t', true, null);
    //}
    Dictionary<string, double> ncounts = DataStructReader.readDictionaryD(languageModelFile, Encoding.UTF8, 0, 1, '\t', false, null, null);
    if (ncounts.Count == 0)
    {
        Console.WriteLine("Language Model Missing... Press key for aborting!");
        Console.ReadLine();
    }
    else
    {
        Dictionary<string, double> userCounts = new Dictionary<string, double>();
        double total = 0;
        if (!File.Exists(inputFile))
        {
            Console.WriteLine("Input File doesn't exist... Press key for aborting!");
            Console.ReadLine();
        }
        else
        {
            // source file -> annotated output file, first entry wins on duplicates.
            Dictionary<string, string> fileCorrespondences = new Dictionary<string, string>();
            string line = "";
            StreamReader rdr = new StreamReader(inputFile, Encoding.UTF8);
            while ((line = rdr.ReadLine()) != null)
            {
                string[] parts = line.Trim().Split('\t');
                if (!fileCorrespondences.ContainsKey(parts[0]))
                {
                    fileCorrespondences.Add(parts[0], parts[1]);
                }
            }
            rdr.Close(); // FIX: reader was previously leaked
            string[] files = fileCorrespondences.Keys.ToArray();
            Dictionary<string, Dictionary<string, int>> singleOccurencesFirst = new Dictionary<string, Dictionary<string, int>>();
            StreamWriter wrtProcessed = new StreamWriter("_preprocessed", false, Encoding.UTF8);
            wrtProcessed.AutoFlush = true;
            foreach (string file in files)
            {
                if (alreadyProcessed)
                {
                    Console.Write("\nReading file: {0}", file);
                }
                else
                {
                    Console.WriteLine("\nProcessing file: {0}", file);
                }
                getUserCounts(ref userCounts, ref singleOccurencesFirst, file, wrtProcessed, ref total, lang, alreadyProcessed);
                //Console.WriteLine(" ... done!");
            }
            wrtProcessed.Close();
            Console.Write("Extracting single word terms");
            // Drop hapax words, normalize the rest to relative frequencies.
            // ToArray() snapshots the keys so entries can be removed while iterating.
            foreach (string key in userCounts.Keys.ToArray())
            {
                if (userCounts[key] < 2 /*|| generalTerms.Contains(key)*/)
                {
                    userCounts.Remove(key);
                }
                else
                {
                    userCounts[key] = userCounts[key] / total;
                }
            }
            Dictionary<string, List<string>> singleOccurences = getSingle(singleOccurencesFirst);
            // Term score = user relative frequency / background relative frequency.
            Dictionary<string, double> results = new Dictionary<string, double>();
            foreach (string word in userCounts.Keys)
            {
                double newVal = 0;
                if (ncounts.ContainsKey(word))
                {
                    newVal = userCounts[word] / ncounts[word];
                }
                else
                {
                    newVal = userCounts[word] / ncounts["_dummy_"];
                }
                results.Add(word, newVal);
            }
            // Sort keys by score ascending, then write them out descending.
            string[] keys = results.Keys.ToArray();
            double[] values = results.Values.ToArray();
            Array.Sort(values, keys);
            StreamWriter wrt = new StreamWriter("_monoTerms", false, Encoding.UTF8);
            wrt.AutoFlush = true;
            for (int i = keys.Length - 1; i >= 0; i--)
            {
                wrt.WriteLine("{0}\t{1}", keys[i], values[i]);
            }
            wrt.Close();
            Console.WriteLine(" ... done!");
            Console.Write("Extracting multi word terms");
            ColocationExtractor ce = new ColocationExtractor();
            Dictionary<string, List<string>> multiOccurences = new Dictionary<string, List<string>>();
            if (ce.extractCollocations("_preprocessed", "_multiTerms"))
            {
                Console.WriteLine(" ... done!");
                Console.Write("Create index for extracting exact occurences");
                if (!Directory.Exists("_index"))
                {
                    Directory.CreateDirectory("_index");
                }
                ce.indexText("_preprocessed", "_index");
                Console.WriteLine(" ... done!");
                Console.Write("Search for exact occurences");
                Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher("_index");
                multiOccurences = retriveOccurences(ce, searcher, "_multiTerms");
                Console.WriteLine(" ... done!");
                searcher.Close();
                // The temporary Lucene index is not needed after retrieval.
                string[] filesToDel = Directory.GetFiles("_index");
                foreach (string f in filesToDel)
                {
                    File.Delete(f);
                }
                Directory.Delete("_index");
            }
            else
            {
                Console.WriteLine(" ... done! \n- no multi word terms found!");
            }
            Console.Write("Retrieving terminology");
            terms = extractTerminology("_monoTerms", "_multiTerms", singleOccurences, multiOccurences);
            if (keepIntermedyaryFiles)
            {
                StreamWriter wrtT = new StreamWriter("_terminology", false, Encoding.UTF8);
                wrtT.AutoFlush = true;
                foreach (string term in terms)
                {
                    wrtT.WriteLine(term);
                }
                wrtT.Close();
            }
            Console.WriteLine(" ... done!");
            HashSet<string> mono = new HashSet<string>();
            Dictionary<string, HashSet<string>> multi = new Dictionary<string, HashSet<string>>();
            HashSet<string> multiOrg = new HashSet<string>();
            getTerms(terms, ref mono, ref multi, ref multiOrg);
            markTerms(lang, fileCorrespondences, mono, multi, multiOrg, alreadyProcessed);
            if (!keepIntermedyaryFiles)
            {
                File.Delete("_preprocessed");
                File.Delete("_monoTerms");
                File.Delete("_multiTerms");
            }
        }
    }
}
// Annotates each source file from fileCorrespondences by wrapping recognized terms
// in <TENAME>...</TENAME> tags and writing the result to the corresponding output file.
// `mono` holds single-word terms, `multi` maps a multi-word prefix to its allowed
// continuation words, `multiOrg` holds complete multi-word terms.
// Two modes: a plain-text tokenizing pass (!alreadyProcessed) and an XML pass over
// <seg> elements containing <w>/<c> token nodes (alreadyProcessed).
private static void markTerms(string lang, Dictionary<string, string> fileCorrespondences, HashSet<string> mono, Dictionary<string, HashSet<string>> multi, HashSet<string> multiOrg, bool alreadyProcessed)
{
    foreach (string file in fileCorrespondences.Keys)
    {
        Console.Write("Annotating {0}... ", Path.GetFileName(file));
        StreamWriter wrt = new StreamWriter(fileCorrespondences[file], false, Encoding.UTF8);
        wrt.AutoFlush = true;
        if (!alreadyProcessed)
        {
            // the text is NOT already preprocessed: tokenize each line ourselves,
            // matching either a word ([\w-]+) or any single other character.
            StreamReader rdr = new StreamReader(file, Encoding.UTF8);
            string line = "";
            Regex regex = new Regex("(?<word>[\\w-]+)|(?<char>.)", RegexOptions.None);
            StringBuilder sb = new StringBuilder(); // pending (possibly multi-word) term being built
            bool inside = false;                    // true while extending a multi-word term in sb
            while ((line = rdr.ReadLine()) != null)
            {
                Match m = regex.Match(line);
                while (m.Success)
                {
                    string val = m.Groups["word"].Value; // empty when the match is a non-word char
                    if (val != "" && (mono.Contains(val.ToLower()) || multi.ContainsKey(val.ToLower()) || inside))
                    {
                        if (!inside)
                        {
                            if (!multi.ContainsKey(val.ToLower()))
                            {
                                // single-word term: emit immediately
                                wrt.Write("<TENAME>" + val + "</TENAME>");
                            }
                            else
                            {
                                // start of a multi-word term: buffer and keep scanning
                                sb.Append("<TENAME>" + val);
                                inside = true;
                            }
                        }
                        else
                        {
                            // NOTE(review): Substring(9) skips "<TENAME>" (8 chars) PLUS the first
                            // character of the buffered word, whereas the XML branch below uses
                            // Substring(8); looks like an off-by-one — confirm the intended key format.
                            string key = sb.ToString().Substring(9).ToLower().Trim();
                            if (multi.ContainsKey(key) && multi[key].Contains(val.ToLower()))
                            {
                                sb.Append(val); // continue the multi-word term
                            }
                            else
                            {
                                // continuation failed: flush the buffered term, then the current word
                                wrt.Write(sb.ToString().Trim() + "</TENAME>");
                                wrt.Write(" " + val);
                                inside = false;
                                sb = new StringBuilder();
                            }
                        }
                    }
                    else
                    {
                        if (!inside)
                        {
                            wrt.Write(m.Value); // plain text / punctuation: pass through
                        }
                        else
                        {
                            if (m.Value != " ")
                            {
                                // non-space breaks the multi-word term: flush buffer and the char
                                wrt.Write(sb.ToString() + "</TENAME>");
                                wrt.Write(m.Value);
                                inside = false;
                                sb = new StringBuilder();
                            }
                            else
                            {
                                sb.Append(" "); // spaces inside a candidate term are buffered
                            }
                        }
                    }
                    m = m.NextMatch();
                }
                wrt.WriteLine();
            }
            rdr.Close();
        }
        else
        {
            // the text is already preprocessed: annotate the <w>/<c> nodes of each <seg>.
            string text = DataStructReader.readWholeTextFile(file, Encoding.UTF8);
            Regex regex = new Regex("<seg lang=\"" + lang + "\">.+?</seg>", RegexOptions.Singleline);
            Match m = regex.Match(text);
            while (m.Success)
            {
                try
                {
                    // Wrap the segment in a root element and declare the SGML entity set so
                    // entities like &b.Theta; resolve; stray control characters are stripped
                    // first because they would make the XML invalid.
                    XmlDocument xdoc = new XmlDocument();
                    xdoc.LoadXml("<!DOCTYPE root [<!ENTITY % SGMLUniq SYSTEM \"sgmlunic.ent\"> 
%SGMLUniq;]>\n<root>" + m.Value.Replace("", "").Replace("\x01", "").Replace("\x1B", "").Replace("&b.theta;", "&b.Theta;") + "</root>");
                    XmlNodeList list = xdoc.SelectNodes("//w|//c");
                    StringBuilder sb = new StringBuilder();       // pending multi-word term buffer
                    bool inside = false;                          // true while extending a multi-word term
                    StringBuilder sentence = new StringBuilder(); // annotated output for this segment
                    string firstPos = "";                         // POS initial of the term's first word
                    foreach (XmlNode node in list)
                    {
                        bool alreadyAdded = false;
                        if (node.Name == "w")
                        {
                            string val = node.InnerText.Replace("_", " ");
                            // first letter of the "ana" attribute = coarse POS tag (n=noun, a=adjective)
                            string pos = node.Attributes["ana"].InnerText.Substring(0, 1).ToLower();
                            if (val != "" && (mono.Contains(val.ToLower()) || multi.ContainsKey(val.ToLower()) || inside))
                            {
                                if (!inside)
                                {
                                    // only nouns/adjectives may open a term
                                    if (pos == "n" || pos == "a")
                                    {
                                        if (!multi.ContainsKey(val.ToLower()))
                                        {
                                            if (pos == "n")
                                            {
                                                sentence.Append(" <TENAME>" + val + "</TENAME>");
                                            }
                                        }
                                        else
                                        {
                                            sb.Append(" <TENAME>" + val);
                                            firstPos = pos;
                                            inside = true;
                                        }
                                    }
                                    else
                                    {
                                        sentence.Append(" " + val);
                                    }
                                }
                                else
                                {
                                    // key = buffered words without the leading "<TENAME>" marker
                                    string key = sb.ToString().Trim().Substring(8).ToLower().Trim();
                                    if (multi.ContainsKey(key) && multi[key].Contains(val.ToLower()))
                                    {
                                        sb.Append(" " + val); // valid continuation
                                    }
                                    else
                                    {
                                        // continuation failed: decide how much of the buffer is a term
                                        string toAdd = sb.ToString().Trim().Substring(8).Trim();
                                        if (multiOrg.Contains(toAdd.ToLower()))
                                        {
                                            // whole buffer is a known multi-word term
                                            sentence.Append(" <TENAME>" + toAdd + "</TENAME>");
                                        }
                                        else
                                        {
                                            int idx = toAdd.IndexOf(' ');
                                            if (idx != -1)
                                            {
                                                // fall back to tagging just the first word, if it is
                                                // a mono term and the term started with a noun
                                                string first = toAdd.Substring(0, idx);
                                                string rest = toAdd.Substring(idx + 1);
                                                if (mono.Contains(first.ToLower()) && firstPos == "n")
                                                {
                                                    sentence.Append(" <TENAME>" + first + "</TENAME> " + rest);
                                                }
                                                else
                                                {
                                                    sentence.Append(" " + toAdd);
                                                }
                                            }
                                            else if (firstPos == "n")
                                            {
                                                sentence.Append(" <TENAME>" + toAdd + "</TENAME>");
                                            }
                                            else
                                            {
                                                sentence.Append(" " + toAdd);
                                            }
                                        }
                                        sentence.Append(" " + val);
                                        inside = false;
                                        sb = new StringBuilder();
                                    }
                                }
                                alreadyAdded = true;
                            }
                        }
                        if (!inside)
                        {
                            if (!alreadyAdded)
                            {
                                sentence.Append(" " + node.InnerText.Replace("_", " "));
                            }
                        }
                        else
                        {
                            if (!alreadyAdded)
                            {
                                // NOTE(review): m.Value is the whole <seg> match and can never equal " ";
                                // this looks copied from the plain-text branch (where it tests the token) —
                                // confirm whether node.InnerText was intended here.
                                if (m.Value != " ")
                                {
                                    sentence.Append(" " + sb.ToString().Trim() + "</TENAME>");
                                    sentence.Append(" " + node.InnerText.Replace("_", " "));
                                    inside = false;
                                    sb = new StringBuilder();
                                }
                                else
                                {
                                    sb.Append(" ");
                                }
                            }
                        }
                    }
                    wrt.WriteLine(sentence.ToString().Trim());
                }
                catch { } // best-effort: segments that fail to parse as XML are skipped silently
                m = m.NextMatch();
            }
        }
        wrt.Close();
        Console.WriteLine("done.");
    }
}