/// <summary>
/// Builds the entity-entity graph (<c>EkwEgg</c>) from <c>EkwDict</c> and <c>kwLexicon</c>,
/// prunes neighbor references to entities that did not become vertices, and writes the
/// adjacency list to "<paramref name="reportPath"/>-{kwLimitThreshold},{entityOccurThreshold}.EkwEGg.csv".
/// </summary>
/// <param name="reportPath">Prefix of the CSV report file path.</param>
/// <param name="kwLimitThreshold">Keywords linked to this many entities or more are not used as linkers.</param>
/// <param name="entityOccurThreshold">Entities whose <c>Occur</c> is less than or equal to this value are skipped.</param>
/// <param name="polarityThresh">Score threshold passed to the polarity classification of an edge.</param>
/// <param name="PoleInvariant">When true, edges classified as <c>polarity.opn</c> are added as well.</param>
/// <param name="NoSelfEdge">When true, an entity is never added as its own neighbor.</param>
public void genEEgraph(string reportPath, int kwLimitThreshold = 350, int entityOccurThreshold = 2, double polarityThresh = 0.35, bool PoleInvariant = false, bool NoSelfEdge = true)
{
    if (EkwEgg == null)
    {
        EkwEgg = new Dictionary<string, EEVertex>(StringComparer.InvariantCultureIgnoreCase);
    }

    Console.WriteLine("Generating EE graph");

    // Count the keywords that are specific enough to be used as linkers.
    int legitKWCnt = 0;
    foreach (KeyValuePair<string, kwEo> cnt in kwLexicon)
    {
        if (cnt.Value.E.Count < kwLimitThreshold)
        {
            legitKWCnt++;
        }
    }
    Console.WriteLine("from " + legitKWCnt + " legit kw");

    DateTime startTime = DateTime.Now;
    foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
    {
        string E = ekwo.Key;
        EkwO ekwp = ekwo.Value;
        double pScore = ekwp.PScore;
        if (ekwp.Occur <= entityOccurThreshold)
        {
            continue;
        }

        // Reuse an existing vertex instead of calling Add unconditionally: EkwEgg may already
        // be populated when this method runs, and its comparer is case-insensitive, so a plain
        // Add would throw ArgumentException on a repeated (or case-variant) entity name.
        EEVertex vertex;
        if (!EkwEgg.TryGetValue(E, out vertex))
        {
            vertex = new EEVertex(E);
            EkwEgg.Add(E, vertex);
        }

        foreach (string keyword in ekwp.kw.Keys)
        {
            // The keyword is a linker only if it is in the lexicon and not too widespread.
            if (!kwLexicon.ContainsKey(keyword))
            {
                continue;
            }
            Dictionary<string, Meta> PotentialNeighbors = kwLexicon[keyword].E;
            if (PotentialNeighbors.Count >= kwLimitThreshold)
            {
                continue;
            }

            foreach (string nborE in PotentialNeighbors.Keys)
            {
                if (nborE == E && NoSelfEdge)
                {
                    continue;
                }

                double nborPScore = EkwDict[nborE].PScore;
                polarity pole = ClassifyEdgePolarity(nborPScore, pScore, polarityThresh);
                if (pole != polarity.opn || PoleInvariant)
                {
                    vertex.AddNeighbor(nborE, keyword, pole);
                }
            }
        }
    }

    // Cleanup: drop neighbor references to entities that did not become vertices.
    foreach (KeyValuePair<string, EEVertex> node in EkwEgg)
    {
        // Collect first, remove afterwards: Neighbors must not change while it is enumerated.
        List<string> staleNeighbors = new List<string>();
        foreach (string nbk in node.Value.Neighbors.Keys)
        {
            if (!EkwEgg.ContainsKey(nbk))
            {
                staleNeighbors.Add(nbk);
            }
        }
        foreach (string nbork in staleNeighbors)
        {
            node.Value.Neighbors.Remove(nbork);
        }
    }

    Console.WriteLine("and " + EkwEgg.Count + " entities");
    Console.WriteLine("EkwEG time to generate EE graph: " + (DateTime.Now - startTime));

    // One row per vertex: name, neighbor count, then the neighbor names each followed by ';'.
    // The using block closes the file even when a write throws.
    using (StreamWriter fs = new StreamWriter(reportPath + "-" + kwLimitThreshold + "," + entityOccurThreshold + ".EkwEGg.csv", false))
    {
        foreach (KeyValuePair<string, EEVertex> evert in EkwEgg)
        {
            fs.Write(evert.Key + "," + evert.Value.Neighbors.Count + ",");
            foreach (string nbor in evert.Value.Neighbors.Keys)
            {
                fs.Write(nbor + ";");
            }
            fs.WriteLine();
        }
    }

    Console.WriteLine("EkwEG time to store & generate EE graph: " + (DateTime.Now - startTime));
}

/// <summary>
/// Classifies the polarity of an edge between two entities from their polarity scores.
/// </summary>
/// <param name="nborPScore">Score of the neighbor entity.</param>
/// <param name="pScore">Score of the entity being processed.</param>
/// <param name="polarityThresh">Threshold each score (or their average) must reach to count as polar.</param>
/// <returns>
/// <c>pos</c>/<c>neg</c> when both scores reach the threshold on the same side, or share a sign
/// and their sum exceeds twice the threshold on that side; <c>obj</c> when they share a sign
/// without reaching it, or when either score is exactly zero; <c>opn</c> otherwise.
/// </returns>
private static polarity ClassifyEdgePolarity(double nborPScore, double pScore, double polarityThresh)
{
    if (nborPScore >= polarityThresh && pScore >= polarityThresh)
    {
        return polarity.pos;
    }
    if (nborPScore <= -polarityThresh && pScore <= -polarityThresh)
    {
        return polarity.neg;
    }
    if (nborPScore > 0 && pScore > 0)
    {
        return nborPScore + pScore > 2 * polarityThresh ? polarity.pos : polarity.obj;
    }
    if (nborPScore < 0 && pScore < 0)
    {
        // Mirror of the positive case. The previous test (sum > 2 * threshold) could never
        // hold for two negative scores with a positive threshold, so neg was unreachable here.
        return nborPScore + pScore < -2 * polarityThresh ? polarity.neg : polarity.obj;
    }
    if (nborPScore == 0 || pScore == 0)
    {
        return polarity.obj;
    }
    return polarity.opn;
}
/// <summary>
/// Loads the XML document at <paramref name="TEkwPpath"/>, and for every <c>TEkwP</c> element
/// registers its comma-separated entities (<c>E</c>) and keywords (<c>kw</c>) with the element's
/// <c>pScore</c> attribute into <c>EkwDict</c> (entity to keywords) and <c>kwEDict</c> (keyword to
/// entities). It then calls <c>freshen(0.35)</c> on every entity, fills <c>kwLexicon</c> from the
/// remaining keywords, and writes two CSV reports: "<paramref name="reportPath"/>.kw.csv" and
/// "<paramref name="reportPath"/>.E.csv".
/// </summary>
/// <param name="TEkwPpath">Path of the XML file containing the <c>TEkwP</c> elements.</param>
/// <param name="reportPath">Prefix of the two CSV report file paths.</param>
public void genEkwBigraph(string TEkwPpath, string reportPath = "a")
{
    DateTime startTime = DateTime.Now;

    XmlDocument xDoc = new XmlDocument();
    xDoc.Load(TEkwPpath);
    xDoc.Normalize();
    XmlNode root = xDoc.DocumentElement;

    int reportedEntityCount = 0;

    // The using blocks guarantee both report files are closed even if processing throws.
    using (StreamWriter fse = new StreamWriter(reportPath + ".E.csv", false))
    using (StreamWriter fsk = new StreamWriter(reportPath + ".kw.csv", false))
    {
        XmlNodeList TEkwPNodeList = root.SelectNodes("//TEkwP");
        Console.WriteLine("E-kw");
        Console.WriteLine("echo: building E-kw bigraph with E-kw matrix and kw-E matrix + kwLexicon");

        int tekwp_i = 0;
        foreach (XmlNode tekwpNode in TEkwPNodeList)
        {
            // Progress indicator, refreshed every 100 elements.
            if ((tekwp_i++) % 100 == 0)
            {
                Console.Write("\r" + tekwp_i);
            }

            // A missing <E>, <kw> or pScore is treated as empty / 0 rather than dereferencing
            // null, in line with the unparsable-pScore case that already falls back to 0.
            XmlNode eNode = tekwpNode.SelectSingleNode("E");
            XmlNode kwNode = tekwpNode.SelectSingleNode("kw");
            XmlAttribute pScoreAttr = tekwpNode.Attributes["pScore"];

            string ENodeInText = eNode != null ? eNode.InnerText : "";
            string kwNodeInText = kwNode != null ? kwNode.InnerText : "";

            double pScore = 0;
            if (pScoreAttr != null)
            {
                double.TryParse(pScoreAttr.Value, out pScore);
            }

            string[] kw = kwNodeInText.Split(',');
            string[] E = ENodeInText.Split(',');

            // E-kw matrix: every entity of the element gets every keyword of the element.
            foreach (string entity in E)
            {
                if (string.IsNullOrEmpty(entity))
                {
                    continue;
                }
                if (!EkwDict.ContainsKey(entity))
                {
                    EkwDict.Add(entity, new EkwO(entity));
                }

                EkwO entityEntry = EkwDict[entity];
                entityEntry.Occur++;
                entityEntry.PScore += pScore;
                foreach (string keyw in kw)
                {
                    if (!string.IsNullOrEmpty(keyw))
                    {
                        entityEntry.AddKW(keyw, pScore);
                    }
                }
            }

            // kw-E matrix: every keyword of the element gets every entity of the element.
            foreach (string keyword in kw)
            {
                if (string.IsNullOrEmpty(keyword))
                {
                    continue;
                }
                if (!kwEDict.ContainsKey(keyword))
                {
                    kwEDict.Add(keyword, new kwEo(keyword));
                }

                kwEo keywordEntry = kwEDict[keyword];
                keywordEntry.Occur++;
                keywordEntry.PScore += pScore;
                foreach (string en in E)
                {
                    if (!string.IsNullOrEmpty(en))
                    {
                        keywordEntry.AddEntity(en, pScore);
                    }
                }
            }
        }
        Console.WriteLine("\r" + tekwp_i);

        // Freshen every entity (original note: removes low-frequency keywords) and count
        // the keywords before (uf) and after (f) the call.
        int kwCount_uf = 0, kwCount_f = 0;
        foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
        {
            kwCount_uf += ekwo.Value.kw.Count;
            ekwo.Value.freshen(0.35);
            kwCount_f += ekwo.Value.kw.Count;
        }

        // Keyword lexicon: each keyword that survived freshening, with the entities using it.
        foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
        {
            EkwO this_Ekw = ekwo.Value;
            foreach (string kwl in this_Ekw.kw.Keys)
            {
                if (string.IsNullOrEmpty(kwl))
                {
                    continue;
                }
                if (!kwLexicon.ContainsKey(kwl))
                {
                    kwLexicon.Add(kwl, new kwEo(kwl));
                }
                kwLexicon[kwl].AddEntity(this_Ekw.E, this_Ekw.PScore);
            }
        }

        Console.WriteLine("echo time to calculate: " + (DateTime.Now - startTime));
        Console.WriteLine("storing in files");

        // Keyword report. The separator between the last two column names was missing,
        // which fused them into a single "E countdescribed entities" header cell.
        fsk.WriteLine("keyword" + "," + "occur" + "," + "ln(occur)" + "," + "PScore" + "," + "E count" + "," + "described entities");
        foreach (KeyValuePair<string, kwEo> lexEntry in kwLexicon)
        {
            string kwl = lexEntry.Key;
            kwEo lexicon = lexEntry.Value;
            kwEo rawStats = kwEDict[kwl];

            // Occurrence and mean score come from the raw kw-E matrix.
            int occur = lexicon.Occur = rawStats.Occur;
            lexicon.PScore = rawStats.PScore / occur;
            if (occur > 1)
            {
                fsk.WriteLine(kwl + "," + occur + "," + Math.Log(occur) + "," + lexicon.PScore + "," + lexicon.E.Count);
            }
        }
        Console.WriteLine("good freq kw: " + kwCount_f + " all kw(raw count) " + kwCount_uf);
        Console.WriteLine(kwLexicon.Count + " kw in lexicon, unique kw");

        // Entity report: only entities seen more than twice are written.
        fse.WriteLine("Entity" + "," + "occur" + "," + "ln(occur)" + "," + "PScore" + "," + "kw Count" + "," + "keywords");
        foreach (KeyValuePair<string, EkwO> e_ekwo in EkwDict)
        {
            int occur = e_ekwo.Value.Occur;
            if (occur > 2)
            {
                fse.WriteLine(e_ekwo.Key + "," + occur + "," + Math.Log(occur) + "," + e_ekwo.Value.PScore + "," + e_ekwo.Value.kw.Count);
                reportedEntityCount++;
            }
        }
    }

    Console.WriteLine(reportedEntityCount + " of " + EkwDict.Count + " E occured more than twice");
    Console.WriteLine("echo time to store and calculate: " + (DateTime.Now - startTime));
}