Exemple #1
0
        /// <summary>
        /// Builds the entity-entity graph in <c>EkwEgg</c>: two entities become neighbours when they
        /// share a keyword of <c>kwLexicon</c>, and the edge is labelled with a polarity derived from
        /// both entities' PScore. The adjacency list is then written to
        /// <c>reportPath-kwLimitThreshold,entityOccurThreshold.EkwEGg.csv</c>.
        /// </summary>
        /// <param name="reportPath">Prefix of the CSV report file name.</param>
        /// <param name="kwLimitThreshold">A keyword only links entities when its lexicon entry references fewer than this many entities.</param>
        /// <param name="entityOccurThreshold">Entities whose <c>Occur</c> is less than or equal to this value get no vertex.</param>
        /// <param name="polarityThresh">Score magnitude used to classify an edge as positive or negative.</param>
        /// <param name="PoleInvariant">When true, edges classified as <c>polarity.opn</c> are added as well.</param>
        /// <param name="NoSelfEdge">When true, an entity is never linked to itself.</param>
        public void genEEgraph(string reportPath, int kwLimitThreshold = 350, int entityOccurThreshold = 2, double polarityThresh = 0.35, bool PoleInvariant = false, bool NoSelfEdge = true)
        {
            if (EkwEgg == null)
            {
                EkwEgg = new Dictionary<string, EEVertex>(StringComparer.InvariantCultureIgnoreCase);
            }
            Console.WriteLine("Generating EE graph");

            // Count the keywords that are allowed to act as linkers (informational only).
            int legitKWCnt = 0;
            foreach (KeyValuePair<string, kwEo> cnt in kwLexicon)
            {
                if (cnt.Value.E.Count < kwLimitThreshold)
                {
                    legitKWCnt++;
                }
            }
            Console.WriteLine("from " + legitKWCnt + " legit kw");
            DateTime startTime = DateTime.Now;

            foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
            {
                string E      = ekwo.Key;
                EkwO   ekwp   = ekwo.Value;
                double pScore = ekwp.PScore;

                if (ekwp.Occur <= entityOccurThreshold)
                {
                    continue;
                }

                // EkwEgg may already hold this entity (graph reused from an earlier call, or a key that
                // differs only by case, since EkwEgg is case-insensitive); Add() would throw in that case.
                EEVertex vertex;
                if (!EkwEgg.TryGetValue(E, out vertex))
                {
                    vertex = new EEVertex(E);
                    EkwEgg.Add(E, vertex);
                }

                foreach (string keyword in ekwp.kw.Keys)
                {
                    // The keyword is a linker only if it is in the lexicon and not too widespread.
                    kwEo lexEntry;
                    if (!kwLexicon.TryGetValue(keyword, out lexEntry) || lexEntry.E.Count >= kwLimitThreshold)
                    {
                        continue;
                    }

                    Dictionary<string, Meta> potentialNeighbors = lexEntry.E;
                    foreach (string nborE in potentialNeighbors.Keys)
                    {
                        if (nborE == E && NoSelfEdge)
                        {
                            continue;
                        }

                        polarity pole = classifyEdgePolarity(pScore, EkwDict[nborE].PScore, polarityThresh);
                        if (pole != polarity.opn || PoleInvariant)
                        {
                            vertex.AddNeighbor(nborE, keyword, pole);
                        }
                    } //finding nbors by kw
                }     //iterating over kw
            }         //iterating over EkwDict

            // Cleanup: drop neighbours that never became vertices themselves.
            foreach (KeyValuePair<string, EEVertex> entry in EkwEgg)
            {
                EEVertex node = entry.Value;

                // Snapshot the keys first; Neighbors is modified while we walk the snapshot.
                string[] nborvs = new string[node.Neighbors.Keys.Count];
                int      i      = 0;
                foreach (string nbk in node.Neighbors.Keys)
                {
                    nborvs[i++] = nbk;
                }
                foreach (string nbork in nborvs)
                {
                    if (!EkwEgg.ContainsKey(nbork))
                    {
                        node.Neighbors.Remove(nbork);
                    }
                }
            }
            Console.WriteLine("and " + EkwEgg.Count + " entities");
            Console.WriteLine("EkwEG time to generate EE graph: " + (TimeSpan)(DateTime.Now - startTime));

            // 'using' guarantees the report file is flushed and closed even if a write throws.
            using (StreamWriter fs = new StreamWriter(reportPath + "-" + kwLimitThreshold + "," + entityOccurThreshold + ".EkwEGg.csv", false))
            {
                foreach (KeyValuePair<string, EEVertex> evert in EkwEgg)
                {
                    fs.Write(evert.Key + "," + evert.Value.Neighbors.Count + ",");
                    foreach (string nbor in evert.Value.Neighbors.Keys)
                    {
                        fs.Write(nbor + ";");
                    }
                    fs.WriteLine();
                }
            }
            Console.WriteLine("EkwEG time to store & generate EE graph: " + (TimeSpan)(DateTime.Now - startTime));
        }

        /// <summary>
        /// Classifies the edge between two entities from their polarity scores.
        /// Returns <c>polarity.opn</c> when no rule applies (scores of opposite sign).
        /// </summary>
        private static polarity classifyEdgePolarity(double pScore, double nborPScore, double polarityThresh)
        {
            if (nborPScore >= polarityThresh && pScore >= polarityThresh)
            {
                return polarity.pos;
            }
            if (nborPScore <= -polarityThresh && pScore <= -polarityThresh)
            {
                return polarity.neg;
            }
            if (nborPScore > 0 && pScore > 0)
            {
                // Both weakly positive: positive only if their combined score is strong enough.
                return (nborPScore + pScore > 2 * polarityThresh) ? polarity.pos : polarity.obj;
            }
            if (nborPScore < 0 && pScore < 0)
            {
                // Mirror of the positive case. The sum of two negatives must be compared against
                // -2 * threshold; comparing against +2 * threshold can never succeed.
                return (nborPScore + pScore < -2 * polarityThresh) ? polarity.neg : polarity.obj;
            }
            if (nborPScore == 0 || pScore == 0)
            {
                return polarity.obj;
            }
            return polarity.opn;
        }
Exemple #2
0
        /// <summary>
        /// Reads every <c>TEkwP</c> element of the XML file at <paramref name="TEkwPpath"/> and builds
        /// the entity-to-keyword table (<c>EkwDict</c>), the keyword-to-entity table (<c>kwEDict</c>)
        /// and the keyword lexicon (<c>kwLexicon</c>). Summaries are written to
        /// <c>reportPath.E.csv</c> and <c>reportPath.kw.csv</c>.
        /// </summary>
        /// <param name="TEkwPpath">Path of the XML document to load.</param>
        /// <param name="reportPath">Prefix of the two CSV report file names.</param>
        public void genEkwBigraph(string TEkwPpath, string reportPath = "a")
        {
            DateTime    startTime = DateTime.Now;
            XmlDocument xDoc      = new XmlDocument();

            xDoc.Load(TEkwPpath);
            xDoc.Normalize();
            XmlNode root = xDoc.DocumentElement;

            int reportedEntities = 0;

            // 'using' guarantees both report files are closed even if processing throws midway.
            using (StreamWriter fse = new StreamWriter(reportPath + ".E.csv", false))
            using (StreamWriter fsk = new StreamWriter(reportPath + ".kw.csv", false))
            {
                XmlNodeList TEkwPNodeList = root.SelectNodes("//TEkwP");

                Console.WriteLine("E-kw");
                Console.WriteLine("echo: building E-kw bigraph with E-kw matrix and kw-E matrix + kwLexicon");

                int tekwp_i = 0;

                foreach (XmlNode tekwpNode in TEkwPNodeList)
                {
                    if ((tekwp_i++) % 100 == 0)
                    {
                        Console.Write("\r" + tekwp_i);
                    }

                    // A missing <E>/<kw> child or pScore attribute is treated as empty / 0 instead of
                    // crashing with a NullReferenceException; an unparsable pScore also yields 0.
                    string       ENodeInText  = innerTextOrEmpty(tekwpNode, "E");
                    string       kwNodeInText = innerTextOrEmpty(tekwpNode, "kw");
                    XmlAttribute pScoreAttr   = tekwpNode.Attributes["pScore"];
                    double       pScore       = 0;
                    if (pScoreAttr != null)
                    {
                        double.TryParse(pScoreAttr.Value, out pScore);
                    }

                    string[] kw = kwNodeInText.Split(',');
                    string[] E  = ENodeInText.Split(',');

                    //E-kw matrix
                    foreach (string entity in E)
                    {
                        if (string.IsNullOrEmpty(entity))
                        {
                            continue;
                        }

                        EkwO entityEntry;
                        if (!EkwDict.TryGetValue(entity, out entityEntry))
                        {
                            entityEntry = new EkwO(entity);
                            EkwDict.Add(entity, entityEntry);
                        }
                        entityEntry.Occur++;
                        entityEntry.PScore += pScore;

                        //register kws
                        foreach (string keyw in kw)
                        {
                            if (!string.IsNullOrEmpty(keyw))
                            {
                                entityEntry.AddKW(keyw, pScore);
                            }
                        }
                    } //built E-kw

                    //kw-E matrix
                    foreach (string keyword in kw)
                    {
                        if (string.IsNullOrEmpty(keyword))
                        {
                            continue;
                        }

                        if (!kwEDict.ContainsKey(keyword))
                        {
                            kwEDict.Add(keyword, new kwEo(keyword));
                        }
                        var keywordEntry = kwEDict[keyword];
                        keywordEntry.Occur++;
                        keywordEntry.PScore += pScore;

                        //register entities
                        foreach (string en in E)
                        {
                            if (!string.IsNullOrEmpty(en))
                            {
                                keywordEntry.AddEntity(en, pScore);
                            }
                        }
                    } //built kw-E
                }     //E-kw bigraph with E-kw and kw-E
                Console.WriteLine("\r" + tekwp_i);

                //freshen up E-kw to remove lowfreq kw; count keywords before and after
                int kwCount_uf = 0, kwCount_f = 0;

                foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
                {
                    kwCount_uf += ekwo.Value.kw.Count;
                    ekwo.Value.freshen(0.35);
                    kwCount_f += ekwo.Value.kw.Count;
                }

                //keyword lexicon: every keyword that survived freshen(), with the entities using it
                foreach (KeyValuePair<string, EkwO> ekwo in EkwDict)
                {
                    EkwO this_Ekw = ekwo.Value;
                    foreach (string kwl in this_Ekw.kw.Keys)
                    {
                        if (string.IsNullOrEmpty(kwl))
                        {
                            continue;
                        }

                        kwEo lexEntry;
                        if (!kwLexicon.TryGetValue(kwl, out lexEntry))
                        {
                            lexEntry = new kwEo(kwl);
                            kwLexicon.Add(kwl, lexEntry);
                        }
                        // Occur and PScore of the lexicon entry are filled in below from kwEDict.
                        lexEntry.AddEntity(this_Ekw.E, this_Ekw.PScore);
                    }
                }

                Console.WriteLine("echo time to calculate: " + (TimeSpan)(DateTime.Now - startTime));
                Console.WriteLine("storing in files");

                // The separator between "E count" and "described entities" was missing, which fused the
                // two column names; the header now has the same shape as the entity report's header.
                fsk.WriteLine("keyword" + "," + "occur" + "," + "ln(occur)" + "," + "PScore" + "," + "E count" + "," + "described entities");
                foreach (KeyValuePair<string, kwEo> lex in kwLexicon)
                {
                    string kwl      = lex.Key;
                    kwEo   lexEntry = lex.Value;
                    var    kwStats  = kwEDict[kwl];

                    int occur = lexEntry.Occur = kwStats.Occur;
                    lexEntry.PScore = kwStats.PScore / occur;
                    if (occur > 1)
                    {
                        fsk.WriteLine(kwl + "," + occur + "," + Math.Log(occur) + "," + lexEntry.PScore + "," + lexEntry.E.Count);
                    }
                }
                Console.WriteLine("good freq kw: " + kwCount_f + " all kw(raw count) " + kwCount_uf);
                Console.WriteLine(kwLexicon.Count + " kw in lexicon, unique kw");

                //review: only entities seen more than twice are reported
                fse.WriteLine("Entity" + "," + "occur" + "," + "ln(occur)" + "," + "PScore" + "," + "kw Count" + "," + "keywords");
                foreach (KeyValuePair<string, EkwO> e_ekwo in EkwDict)
                {
                    int occur = e_ekwo.Value.Occur;
                    if (occur > 2)
                    {
                        fse.WriteLine(e_ekwo.Key + "," + occur + "," + Math.Log(occur) + "," + e_ekwo.Value.PScore + "," + e_ekwo.Value.kw.Count);
                        reportedEntities++;
                    }
                }
            }

            Console.WriteLine(reportedEntities + " of " + EkwDict.Count + " E occured more than twice");
            Console.WriteLine("echo time to store and calculate: " + (TimeSpan)(DateTime.Now - startTime));
        }

        /// <summary>
        /// Returns the inner text of the first child selected by <paramref name="childName"/>,
        /// or an empty string when no such child exists.
        /// </summary>
        private static string innerTextOrEmpty(XmlNode parent, string childName)
        {
            XmlNode child = parent.SelectSingleNode(childName);
            return child == null ? "" : child.InnerText;
        }