public static void ConsolidateRefTabsep(string inputFile, string outputFile, double threshold) { NumberFormatInfo nfi = new NumberFormatInfo (); nfi.CurrencyDecimalSeparator = "."; nfi.NumberDecimalSeparator = "."; nfi.PercentDecimalSeparator = "."; StreamReader sr = new StreamReader (inputFile, Encoding.UTF8); StreamWriter sw = new StreamWriter (outputFile, false, new UTF8Encoding (false)); sw.NewLine = "\n"; Dictionary<string,Dictionary<string,int>> added = new Dictionary<string, Dictionary<string, int>> (); // SRC -> TRG -> ElemList Dictionary<string,Dictionary<string,List<ConsolidationElement>>> lemmaDict = new Dictionary<string, Dictionary<string, List<ConsolidationElement>>> (); char[] sep = {'\t'}; while (!sr.EndOfStream) { string line = sr.ReadLine (); if (!string.IsNullOrWhiteSpace (line)) { string[] dataArr = line.Split (sep, StringSplitOptions.None); if (dataArr.Length == 20) { double prob = Convert.ToDouble (dataArr [6], nfi); if (prob >= threshold-0.1) { string srcKey = dataArr [1] + dataArr [7] + dataArr [8]; string trgKey = dataArr [3] + dataArr [12] + dataArr [13]; string srcLemmaKey = System.Net.WebUtility.HtmlDecode(dataArr [8]); string trgLemmaKey = System.Net.WebUtility.HtmlDecode(dataArr [13]); if (srcLemmaKey.Length<3||trgLemmaKey.Length<3) continue; if (!IsValidLemma(srcLemmaKey)||!IsValidLemma(trgLemmaKey)) continue; if (!added.ContainsKey (srcKey)) { added.Add (srcKey, new Dictionary<string, int> ()); } if (!added [srcKey].ContainsKey (trgKey)) { added [srcKey].Add (trgKey, 1); //sw.WriteLine (line); if (!lemmaDict.ContainsKey (srcLemmaKey)) { lemmaDict.Add (srcLemmaKey, new Dictionary<string, List<ConsolidationElement>> ()); } if (!lemmaDict [srcLemmaKey].ContainsKey (trgLemmaKey)) { lemmaDict [srcLemmaKey].Add (trgLemmaKey, new List<ConsolidationElement> ()); } ConsolidationElement ce = new ConsolidationElement (); ce.line = line; ce.prob = prob; lemmaDict [srcLemmaKey] [trgLemmaKey].Add (ce); } else { added [srcKey] [trgKey]++; } } } } } List<string> srcLemmas = new List<string> (lemmaDict.Keys); srcLemmas.Sort (); foreach (string srcLemma in srcLemmas) { double max = Double.MinValue; Dictionary<string,double> avgProbDict = new Dictionary<string, double>(); List<string> trgLemmas = new List<string>(); foreach(string trgLemma in lemmaDict[srcLemma].Keys) { trgLemmas.Add(trgLemma); double sum=0; double count=0; foreach(ConsolidationElement ce in lemmaDict[srcLemma][trgLemma]) { sum+=ce.prob; count++; } double score = 0; if (count>0) { score = sum/count; if (score>max) max = score; } avgProbDict.Add(trgLemma,score); } trgLemmas.Sort(); double minThr = max - 0.05; foreach(string trgLemma in trgLemmas) { if (avgProbDict[trgLemma]>=minThr&&avgProbDict[trgLemma]>=threshold) { List<string> lines = new List<string>(); foreach(ConsolidationElement ce in lemmaDict[srcLemma][trgLemma]) { lines.Add(ce.line); } lines.Sort(); foreach(string line in lines) { sw.WriteLine(System.Net.WebUtility.HtmlDecode(line)); } } } } sw.Close (); sr.Close (); }
public static void ConsolidateRefTabsep(string inputFile, string outputFile, double threshold) { NumberFormatInfo nfi = new NumberFormatInfo(); nfi.CurrencyDecimalSeparator = "."; nfi.NumberDecimalSeparator = "."; nfi.PercentDecimalSeparator = "."; StreamReader sr = new StreamReader(inputFile, Encoding.UTF8); StreamWriter sw = new StreamWriter(outputFile, false, new UTF8Encoding(false)); sw.NewLine = "\n"; Dictionary <string, Dictionary <string, int> > added = new Dictionary <string, Dictionary <string, int> > (); // SRC -> TRG -> ElemList Dictionary <string, Dictionary <string, List <ConsolidationElement> > > lemmaDict = new Dictionary <string, Dictionary <string, List <ConsolidationElement> > > (); char[] sep = { '\t' }; while (!sr.EndOfStream) { string line = sr.ReadLine(); if (!string.IsNullOrWhiteSpace(line)) { string[] dataArr = line.Split(sep, StringSplitOptions.None); if (dataArr.Length == 20) { double prob = Convert.ToDouble(dataArr [6], nfi); if (prob >= threshold - 0.1) { string srcKey = dataArr [1] + dataArr [7] + dataArr [8]; string trgKey = dataArr [3] + dataArr [12] + dataArr [13]; string srcLemmaKey = System.Net.WebUtility.HtmlDecode(dataArr [8]); string trgLemmaKey = System.Net.WebUtility.HtmlDecode(dataArr [13]); if (srcLemmaKey.Length < 3 || trgLemmaKey.Length < 3) { continue; } if (!IsValidLemma(srcLemmaKey) || !IsValidLemma(trgLemmaKey)) { continue; } if (!added.ContainsKey(srcKey)) { added.Add(srcKey, new Dictionary <string, int> ()); } if (!added [srcKey].ContainsKey(trgKey)) { added [srcKey].Add(trgKey, 1); //sw.WriteLine (line); if (!lemmaDict.ContainsKey(srcLemmaKey)) { lemmaDict.Add(srcLemmaKey, new Dictionary <string, List <ConsolidationElement> > ()); } if (!lemmaDict [srcLemmaKey].ContainsKey(trgLemmaKey)) { lemmaDict [srcLemmaKey].Add(trgLemmaKey, new List <ConsolidationElement> ()); } ConsolidationElement ce = new ConsolidationElement(); ce.line = line; ce.prob = prob; lemmaDict [srcLemmaKey] [trgLemmaKey].Add(ce); } else { added [srcKey] [trgKey]++; } } } } } List <string> srcLemmas = new List <string> (lemmaDict.Keys); srcLemmas.Sort(); foreach (string srcLemma in srcLemmas) { double max = Double.MinValue; Dictionary <string, double> avgProbDict = new Dictionary <string, double>(); List <string> trgLemmas = new List <string>(); foreach (string trgLemma in lemmaDict[srcLemma].Keys) { trgLemmas.Add(trgLemma); double sum = 0; double count = 0; foreach (ConsolidationElement ce in lemmaDict[srcLemma][trgLemma]) { sum += ce.prob; count++; } double score = 0; if (count > 0) { score = sum / count; if (score > max) { max = score; } } avgProbDict.Add(trgLemma, score); } trgLemmas.Sort(); double minThr = max - 0.05; foreach (string trgLemma in trgLemmas) { if (avgProbDict[trgLemma] >= minThr && avgProbDict[trgLemma] >= threshold) { List <string> lines = new List <string>(); foreach (ConsolidationElement ce in lemmaDict[srcLemma][trgLemma]) { lines.Add(ce.line); } lines.Sort(); foreach (string line in lines) { sw.WriteLine(System.Net.WebUtility.HtmlDecode(line)); } } } } sw.Close(); sr.Close(); }