Example #1
0
        /// <summary>
        /// Handles confirmation of the IDF pre-processing file dialog: stores the result of
        /// <c>IDF.IDFGenerator.fromFiles</c> for the selected files in <c>this.idf</c>, then
        /// shows the IDF save dialog.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Cancel arguments supplied by the dialog (not used).</param>
        private void IDFPreprocessDocsDialog_FileOk(object sender, CancelEventArgs e)
        {
            // The selected paths are handed to the generator as-is.
            this.idf = IDF.IDFGenerator.fromFiles(this.IDFPreprocessDocsDialog.FileNames);

            this.IDFSaveDialog.ShowDialog();
        }
Example #2
0
        /// <summary>
        /// Generates the update configuration, starts an update against the local test server
        /// and blocks until either the download-success or the error callback has fired.
        /// The test fails when the error callback fired.
        /// </summary>
        public void TestMethodDownload()
        {
            ResetLog(); // reset the log
            //SynchronizationContext contex = new SynchronizationContext();
            //SynchronizationContext.SetSynchronizationContext(contex);

            // Called this way, the callbacks do not switch back to the original thread.
            Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

            Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
            IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

            //FileHelper.CleanDirectory("../test/Temp");

            bool isDone = false;
            bool isFail = false;

            IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
            .OnDownloadSuccess((obj) =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
                Volatile.Write(ref isDone, true);
            })
            .OnError((e) =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

                // Do not assert here: an assertion throws inside the callback, so the
                // completion flag would never be set and the wait loop below would spin forever.
                // Record the failure first, then publish completion.
                Volatile.Write(ref isFail, true);
                Volatile.Write(ref isDone, true);
            }).Go();

            // The flags are written by the callbacks and polled here, hence the volatile access.
            while (!Volatile.Read(ref isDone))
            {
                Thread.Sleep(15);
            }

            // A download error means the test does not pass.
            Assert.IsFalse(Volatile.Read(ref isFail));
        }
        /// <summary>
        /// Reads every <c>*.txt</c> file directly inside <paramref name="clusterDir"/>, runs each
        /// through a <c>DocumentProcessor</c>, and stores in <c>this.centroidWords</c> the result of
        /// <c>applyKeepWords</c> over a per-word table of
        /// (word count * <c>IDF.getInstance().get(word)</c>) / number of documents.
        /// </summary>
        /// <param name="clusterDir">Directory whose top-level text files make up the cluster.</param>
        private void load(string clusterDir)
        {
            DocumentProcessor processor   = new DocumentProcessor();
            ArrayList         clusterDocs = new ArrayList();

            foreach (string path in Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly))
            {
                Document parsed = processor.process(File.ReadAllText(path, Encoding.Default));
                clusterDocs.Add(parsed);
            }

            DocsStatistics stats   = DocsStatistics.generateStatistics(clusterDocs);
            Hashtable      weights = new Hashtable();

            foreach (string term in stats.wordsCount.Keys)
            {
                // Word count scaled by the word's IDF value, averaged over the documents read above.
                weights[term] = (((int)stats.wordsCount[term]) * IDF.getInstance().get(term)) / clusterDocs.Count;
            }

            this.centroidWords = applyKeepWords(weights, this.keepWords);
        }
Example #4
0
 /// <summary>
 /// Form load handler: installs a <c>TextBoxTrace</c> bound to <c>TraceTxt</c> as the Trace
 /// instance, installs the IDF instance read from <c>Conf.IDFFILE_PATH</c>, and selects the
 /// first entry of the algorithm combo box.
 /// </summary>
 /// <param name="sender">Event source (not used).</param>
 /// <param name="e">Event arguments (not used).</param>
 private void MainForm_Load(object sender, EventArgs e)
 {
     TextBoxTrace formTrace = new TextBoxTrace(this.TraceTxt);
     Trace.setInstance(formTrace);
     //Trace.setInstance(new NullTrace());

     IDF.setInstance(IDF.fromFile(Conf.IDFFILE_PATH));

     this.AlgorithmCmbo.SelectedIndex = 0;
 }
Example #5
0
        /// <summary>
        /// Same flow as a synchronous download test, but the update is started from a
        /// <c>Task.Run</c> worker. Blocks until a success or error callback has fired and fails
        /// when the error callback fired or the worker itself threw.
        /// </summary>
        public void TestMethodDownloadAsync()
        {
            ResetLog(); // reset the log
            bool isDone = false;
            bool isFail = false;

            // Started from this worker thread, the callbacks still do not switch back to the original thread.
            Task worker = Task.Run(() =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

                Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
                IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

                //FileHelper.CleanDirectory("../test/Temp");

                IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
                .OnDownloadSuccess((obj) =>
                {
                    Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
                    Volatile.Write(ref isDone, true);
                })
                .OnError((e) =>
                {
                    Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

                    // Publish the failure BEFORE completion; otherwise the waiting thread can
                    // observe isDone, leave its loop and assert on a not-yet-set isFail.
                    Volatile.Write(ref isFail, true);
                    Volatile.Write(ref isDone, true);
                }).Go();
            });

            while (!Volatile.Read(ref isDone))
            {
                if (worker.IsFaulted)
                {
                    // The worker threw before any callback could fire; rethrow its exception
                    // here instead of waiting forever for a flag nobody will set.
                    worker.GetAwaiter().GetResult();
                }

                Thread.Sleep(15);
            }

            Assert.IsFalse(Volatile.Read(ref isFail));
        }
Example #6
0
        /// <summary>
        /// Click handler: resets the log, writes the update configuration and starts an update
        /// from the local test server. Both callbacks only log the id of the thread they run on;
        /// the handler does not wait for them.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="ea">Event arguments (not used).</param>
        private void button1_Click(object sender, EventArgs ea)
        {
            ResetLog(); // reset the log

            Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

            Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
            IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

            //FileHelper.CleanDirectory("../test/Temp");

            IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
            .OnDownloadSuccess(updater =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
            })
            .OnError(error =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
            })
            .Go();
        }
Example #7
0
        /// <summary>
        /// Fits an <c>IDF</c> estimator on a one-row tokenized and hashed DataFrame, then checks
        /// the fitted model's output column, its getters, a save/load round trip (by Uid) and
        /// the shared feature checks in <c>TestFeatureBase</c>.
        /// </summary>
        public void TestIDFModel()
        {
            string inputCol   = "rawFeatures";
            string outputCol  = "features";
            int    minDocFreq = 1980;

            DataFrame sentences =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

            // sentence -> words
            Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words");
            DataFrame tokenized = tokenizer.Transform(sentences);

            // words -> rawFeatures
            HashingTF hashingTF = new HashingTF().SetInputCol("words").SetOutputCol(inputCol).SetNumFeatures(20);
            DataFrame featurized = hashingTF.Transform(tokenized);

            // rawFeatures -> features
            IDF estimator = new IDF().SetInputCol(inputCol).SetOutputCol(outputCol).SetMinDocFreq(minDocFreq);
            IDFModel model = estimator.Fit(featurized);
            DataFrame rescaled = model.Transform(featurized);

            Assert.Contains(outputCol, rescaled.Columns());

            Assert.Equal(inputCol, model.GetInputCol());
            Assert.Equal(outputCol, model.GetOutputCol());
            Assert.Equal(minDocFreq, model.GetMinDocFreq());

            // Round trip through a temporary directory; the reloaded model must keep its Uid.
            using (TemporaryDirectory scratch = new TemporaryDirectory())
            {
                string modelPath = Path.Join(scratch.Path, "idfModel");
                model.Save(modelPath);

                IDFModel reloaded = IDFModel.Load(modelPath);
                Assert.Equal(model.Uid(), reloaded.Uid());
            }

            TestFeatureBase(model, "minDocFreq", 1000);
        }
Example #8
0
        /// <summary>
        /// Runs three updates in sequence against URLs that are not expected to serve the
        /// package, waiting for each one to finish (move done or error) before starting the next.
        /// The test makes no assertions; it completes once all three updates have reported back.
        /// </summary>
        public void TestMethodNoServer()
        {
            ResetLog(false); // reset the log

            Runing.Increment.Log.Info("UnitTest1.TestMethodNoServer(): http://127.0.0.1:11122");
            UpdateAndWaitForCompletion("http://127.0.0.1:11122/test/IDFTest.zip");

            Runing.Increment.Log.Info("UnitTest1.TestMethodNoServer(): http://baidu.com/IDFTest.zip");
            UpdateAndWaitForCompletion("http://baidu.com/IDFTest.zip");

            Runing.Increment.Log.Info("UnitTest1.TestMethodNoServer(): http://www.google.com");
            UpdateAndWaitForCompletion("http://www.google.com/IDFTest.zip");
        }

        /// <summary>
        /// Starts one update from <paramref name="url"/> and blocks until its move-file-done or
        /// error callback has fired; a successful download triggers the file move.
        /// </summary>
        /// <param name="url">Address of the update package to request.</param>
        private static void UpdateAndWaitForCompletion(string url)
        {
            // One flag per update: with a single shared flag, a late callback from an earlier
            // update could end the wait of a later one.
            bool isDone = false;

            IDF.Update(url, "../test/Temp", "../test/Target", "../test/Backup")
            .OnMoveFileDone((obj, success) => { Volatile.Write(ref isDone, true); })
            .OnDownloadSuccess((obj) => { obj.MoveFile(); })
            .OnError((e) => { Volatile.Write(ref isDone, true); })
            .Go();

            // The flag is written by a callback and polled here, hence the volatile access.
            while (!Volatile.Read(ref isDone))
            {
                Thread.Sleep(50);
            }
        }
Example #9
0
        /// <summary>
        /// Tokenizes and hashes the documents found under <paramref name="sourceDir"/>, fits an
        /// IDF model on the "rawFeatures" column, and returns that model together with a
        /// DataFrame holding "Path", "features" and a "norm" column computed by
        /// <c>udfCalcNorm</c> over "features".
        /// </summary>
        /// <param name="sourceDir">Directory passed to <c>GetSourceFiles</c>.</param>
        /// <param name="tokenizer">Tokenizer applied to the source documents.</param>
        /// <param name="hashingTF">Hashing transformer applied to the tokenized documents.</param>
        /// <returns>The fitted model and the DataFrame with the added "norm" column.</returns>
        private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(string sourceDir,
                                                                            Tokenizer tokenizer, HashingTF hashingTF)
        {
            var documents  = toDF(GetSourceFiles(sourceDir));
            var featurized = hashingTF.Transform(tokenizer.Transform(documents));

            var model = new IDF()
                        .SetInputCol("rawFeatures")
                        .SetOutputCol("features")
                        .Fit(featurized);

            var normalized = model.Transform(featurized)
                                  .Select("Path", "features")
                                  .WithColumn("norm", udfCalcNorm(Col("features")));

            return (model, normalized);
        }
Example #10
0
        /// <summary>
        /// Computes the idf-modified cosine between two sentences: the sum over their common
        /// words of tf(first) * tf(second) * idf^2, divided by the product of the two
        /// sentences' idf-weighted norms.
        /// </summary>
        /// <param name="idf">Source of per-word idf values.</param>
        /// <param name="firstSentence">First sentence.</param>
        /// <param name="secondSentence">Second sentence.</param>
        /// <returns>
        /// The similarity, or 0 when either sentence has a zero norm (for example, no words),
        /// where the plain quotient would be 0/0 = NaN.
        /// </returns>
        public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
        {
            HashSet <string> commonWords = new HashSet <string>();

            foreach (string aWord in firstSentence.words)
            {
                if (secondSentence.words.Contains(aWord))
                {
                    commonWords.Add(aWord);
                }
            }

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += (termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2));
            }

            double denominator = idfWeightedNorm(idf, firstSentence) * idfWeightedNorm(idf, secondSentence);

            // A zero norm means there is nothing to compare; report "no similarity" instead of NaN.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }

        /// <summary>
        /// Square root of the sum, over the entries of the sentence's word list, of (tf * idf)^2.
        /// </summary>
        private static double idfWeightedNorm(IDF idf, Sentence sentence)
        {
            double sumOfSquares = 0;

            // NOTE(review): this walks sentence.words as-is; if that list can contain repeated
            // words they are counted once per occurrence here but once in the numerator - confirm.
            foreach (string aWord in sentence.words)
            {
                sumOfSquares += Math.Pow(termFrequency(sentence, aWord) * idf.get(aWord), 2);
            }

            return Math.Sqrt(sumOfSquares);
        }
Example #11
0
        /// <summary>
        /// Calls <c>calcSentenceWeight</c> for every sentence of <paramref name="newDoc"/> and
        /// returns the result of <c>SummaryUtil.SummarizeByCompressionRatio</c> over those sentences.
        /// </summary>
        /// <param name="newDoc">Document to summarize.</param>
        /// <param name="compressionRatio">Ratio forwarded to the summarizer.</param>
        /// <returns>The generated summary text.</returns>
        public override string generateSummary(Document newDoc, double compressionRatio)
        {
            foreach (Sentence sentence in newDoc.sentences)
            {
                calcSentenceWeight(IDF.getInstance(), newDoc, sentence);
            }

            Sentence[] weighted = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));

            return SummaryUtil.SummarizeByCompressionRatio(weighted, compressionRatio);
        }
Example #12
0
    /// <summary>
    /// Prints the <paramref name="m"/> tokens with the highest inverse document frequency,
    /// log(N / number of contexts containing the token), across <paramref name="contexts"/>.
    /// Ties are broken by ordinal token order. The "-TITLE-" marker token is ignored.
    /// </summary>
    /// <param name="contexts">Space-separated token strings, one per context.</param>
    /// <param name="m">Maximum number of tokens to print.</param>
    public void extract_trigger_words(string[] contexts, int m)
    {
        int N = contexts.Length;

        List<IDF> idfs = new List<IDF>();

        // Index by token so each lookup is O(1) instead of a scan over every token seen so far.
        Dictionary<string, IDF> idfByToken = new Dictionary<string, IDF>();

        foreach (string context in contexts)
        {
            string[] tokens = context.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Distinct(): a token counts once per context, however often it repeats in it.
            foreach (string token in tokens.Where(t => t != "-TITLE-").Distinct())
            {
                IDF idf;

                if (idfByToken.TryGetValue(token, out idf))
                {
                    idf.Counted++;
                }
                else
                {
                    idf = new IDF() {
                       Name = token,
                       Counted = 1,
                       Value = 0
                    };

                    idfByToken.Add(token, idf);
                    idfs.Add(idf);
                }
            }
        }

        foreach (var idf in idfs)
        {
            // Floating-point division: an integer N / Counted would truncate the ratio before
            // the logarithm and collapse many different frequencies onto the same value.
            idf.Value = Math.Log((double)N / idf.Counted);
        }

        foreach (var idf in idfs.OrderByDescending(i => i.Value).ThenBy(i => i.Name, StringComparer.Ordinal).Take(m))
        {
            System.Console.WriteLine(idf.Name);
        }
    }
Example #13
0
        /// <summary>
        /// Builds a two-document corpus and checks <c>IDF.Evaluate</c> against
        /// log10(document count / expected divisor) for three words of the first document,
        /// within tolerance <c>eps</c>.
        /// </summary>
        public void IDFTest()
        {
            var firstDoc  = new Document(new List <string> { "АААА", "АААА", "ББББ", "ЗАС" });
            var secondDoc = new Document(new List <string> { "АААА", "АААА", "ББББ", "ББББ", "ББББ" });

            var corpus = new Dictionary <string, Document>
            {
                ["1"] = firstDoc,
                ["2"] = secondDoc
            };
            IDF idfEvaluator = new IDF(corpus);

            double expectedForTwo = Math.Log10((double)corpus.Count / 2);
            double expectedForOne = Math.Log10((double)corpus.Count / 1);

            Assert.AreEqual(expectedForTwo, idfEvaluator.Evaluate(firstDoc[0]), eps);
            Assert.AreEqual(expectedForTwo, idfEvaluator.Evaluate(firstDoc[2]), eps);
            Assert.AreEqual(expectedForOne, idfEvaluator.Evaluate(firstDoc[3]), eps);
        }
Example #14
0
        /// <summary>
        /// Builds the symmetric matrix of pairwise <c>idfModifiedCos</c> values for the given
        /// sentences. The diagonal is set to 1.
        /// </summary>
        /// <param name="idf">Source of per-word idf values, forwarded to <c>idfModifiedCos</c>.</param>
        /// <param name="sentences">List whose elements are cast to <c>Sentence</c>.</param>
        /// <returns>A square jagged array with one row and one column per sentence.</returns>
        public static double[][] generateIdfModifiedCosineMatrix(IDF idf, ArrayList sentences)
        {
            int count = sentences.Count;

            double[][] idfModifiedCosine = new double[count][];

            for (int i = 0; i < count; i++)
            {
                idfModifiedCosine[i] = new double[count];
            }

            for (int i = 0; i < count; i++)
            {
                Sentence firstSent = (Sentence)sentences[i];

                // same sentence then 1
                idfModifiedCosine[i][i] = 1;

                // Only the upper triangle is computed and then mirrored. Using "value != 0" as an
                // "already processed" marker would recompute every pair whose similarity is 0.
                for (int j = i + 1; j < count; j++)
                {
                    Sentence secondSent = (Sentence)sentences[j];

                    double similarity = idfModifiedCos(idf, firstSent, secondSent);

                    idfModifiedCosine[i][j] = similarity;
                    idfModifiedCosine[j][i] = similarity;
                }
            }

            return idfModifiedCosine;
        }
Example #15
0
        /// <summary>
        /// Similarity between two word-to-weight tables: the sum over their common words of
        /// first[w] * second[w] * idf(w), divided by the product of the Euclidean norms of
        /// the two tables' values.
        /// </summary>
        /// <param name="idf">Source of per-word idf values.</param>
        /// <param name="first">Table mapping string words to boxed double weights.</param>
        /// <param name="second">Table mapping string words to boxed double weights.</param>
        /// <returns>
        /// The similarity, or 0 when either table has a zero norm (for example, is empty),
        /// where the plain quotient would be 0/0 = NaN.
        /// </returns>
        public static double sim(IDF idf, Hashtable first, Hashtable second)
        {
            HashSet <string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

            double numerator = 0;

            foreach (string aWord in commonWords)
            {
                numerator += ((double)first[aWord] * (double)second[aWord] * idf.get(aWord));
            }

            double denominator1 = 0;

            foreach (string aWord in first.Keys)
            {
                denominator1 += Math.Pow((double)first[aWord], 2);
            }

            denominator1 = Math.Sqrt(denominator1);

            double denominator2 = 0;

            foreach (string aWord in second.Keys)
            {
                denominator2 += Math.Pow((double)second[aWord], 2);
            }

            denominator2 = Math.Sqrt(denominator2);

            double denominator = denominator1 * denominator2;

            // A zero norm means one side carries no weight at all; report "no similarity"
            // instead of letting NaN leak into the caller's comparisons.
            if (denominator == 0)
            {
                return 0;
            }

            return numerator / denominator;
        }
Example #16
0
        /// <summary>
        /// Configures an <c>IDF</c> estimator, checks that its getters return the configured
        /// values, and verifies that a save/load round trip keeps the same Uid.
        /// </summary>
        public void TestIDFModel()
        {
            string inputCol   = "rawFeatures";
            string outputCol  = "features";
            int    minDocFreq = 100;

            IDF estimator = new IDF().SetInputCol(inputCol).SetOutputCol(outputCol).SetMinDocFreq(minDocFreq);

            Assert.Equal(inputCol, estimator.GetInputCol());
            Assert.Equal(outputCol, estimator.GetOutputCol());
            Assert.Equal(minDocFreq, estimator.GetMinDocFreq());

            // Round trip through a temporary directory.
            using (TemporaryDirectory scratch = new TemporaryDirectory())
            {
                string savePath = Path.Join(scratch.Path, "IDF");
                estimator.Save(savePath);

                IDF reloaded = IDF.Load(savePath);
                Assert.Equal(estimator.Uid(), reloaded.Uid());
            }
        }
Example #17
0
        /// <summary>
        /// Scores every sentence of <paramref name="newDoc"/> as
        /// centroidWeight * C + positionalWeight * P + firstSentenceWeight * F, then repeatedly
        /// applies a redundancy penalty and re-summarizes until the summary text stops changing.
        /// C is the sum of <c>getCentroidValue</c> over the sentence's words,
        /// P is (n - i) * max(C) / n for the sentence at index i of n, and
        /// F sums, per word, (count in title + count in first sentence) * count in the sentence.
        /// </summary>
        /// <param name="newDoc">Document to summarize.</param>
        /// <param name="compressionRatio">Ratio forwarded to the summarizer.</param>
        /// <returns>The generated summary text.</returns>
        public override string generateSummary(Document newDoc, double compressionRatio)
        {
            int sentenceCount = newDoc.sentences.Count;

            double[] cTotal = new double[sentenceCount];
            double[] pTotal = new double[sentenceCount];
            double[] fTotal = new double[sentenceCount];
            double   cMax   = double.MinValue;

            ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

            for (int i = 0; i < sentenceCount; i++)
            {
                Sentence currSent = (Sentence)newDoc.sentences[i];

                // Calculate C
                cTotal[i] = 0;
                foreach (string word in currSent.words)
                {
                    cTotal[i] += getCentroidValue(centroids, word);
                }

                if (cTotal[i] > cMax)
                {
                    cMax = cTotal[i];
                }

                // Calculate F
                fTotal[i] = 0;

                foreach (string word in currSent.words)
                {
                    int wordOccurence = 0;

                    if (newDoc.title.wordsCount[word] != null)
                    {
                        wordOccurence += ((int)newDoc.title.wordsCount[word]);
                    }

                    // The first sentence only contributes when the document has more than one sentence.
                    if (sentenceCount > 1)
                    {
                        Sentence firstSent = (Sentence)newDoc.sentences[0];

                        if (firstSent.wordsCount[word] != null)
                        {
                            wordOccurence += ((int)firstSent.wordsCount[word]);
                        }
                    }

                    fTotal[i] += (wordOccurence * ((int)currSent.wordsCount[word]));
                }
            }

            // Calculate P
            for (int i = 0; i < sentenceCount; i++)
            {
                // Remove + 1 as arrays are zero based.
                pTotal[i] = ((sentenceCount - i) * cMax) / sentenceCount;
            }

            double maxScore = double.MinValue;

            for (int i = 0; i < sentenceCount; i++)
            {
                double currWeight = (this.centroidWeight * cTotal[i]) + (this.positionalWeight * pTotal[i]) + (this.firstSentenceWeight * fTotal[i]);

                ((Sentence)newDoc.sentences[i]).weight = currWeight;

                if (currWeight > maxScore)
                {
                    maxScore = currWeight;
                }
            }

            string genSummary     = null;
            string prevgenSummary = null;

            do
            {
                // Penalize each later sentence for its redundancy with every earlier one.
                for (int i = 0; i < sentenceCount; i++)
                {
                    for (int j = i + 1; j < sentenceCount; j++)
                    {
                        double redundancy = redundancyPenalty((Sentence)newDoc.sentences[i], (Sentence)newDoc.sentences[j]);

                        ((Sentence)newDoc.sentences[j]).weight -= (maxScore * redundancy);
                    }
                }

                maxScore = double.MinValue;

                for (int i = 0; i < sentenceCount; i++)
                {
                    if (((Sentence)newDoc.sentences[i]).weight > maxScore)
                    {
                        maxScore = ((Sentence)newDoc.sentences[i]).weight;
                    }
                }

                // typeof avoids constructing a throwaway Sentence just to learn its type.
                Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));

                prevgenSummary = genSummary;

                // Honor the ratio the caller passed in rather than the instance field, which
                // would silently ignore the argument.
                genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
            } while (!genSummary.Equals(prevgenSummary));

            return genSummary;
        }
Example #18
0
        /// <summary>
        /// Summarizes a set of documents by ranking all their sentences: builds the
        /// idf-modified cosine matrix, thresholds it to 0/1 while counting each row's degree,
        /// normalizes rows by degree and mixes in the damping factor, then uses the weights
        /// returned by <c>LexRankCommon.powerMethod</c> as sentence weights for the summarizer.
        /// </summary>
        /// <param name="docs">Documents whose sentences are pooled and ranked.</param>
        /// <param name="compressionRatio">Ratio forwarded to the summarizer.</param>
        /// <returns>The generated summary text.</returns>
        public override string generateSummary(ArrayList docs, double compressionRatio)
        {
            ArrayList allSents = new ArrayList();

            foreach (Document doc in docs)
            {
                allSents.AddRange(doc.sentences);
            }

            double[][] idfModifiedCosineMatrix = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), allSents);

            Trace.write(" IDF Cosine Matrix : ");
            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            // New arrays are zero-filled, so no explicit initialization is needed.
            double[] sentDegree = new double[allSents.Count];

            // Binarize by threshold and count, per row, how many entries passed.
            for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
            {
                for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
                {
                    if (idfModifiedCosineMatrix[i][j] > this.threshold)
                    {
                        idfModifiedCosineMatrix[i][j] = 1;
                        sentDegree[i]++;
                    }
                    else
                    {
                        idfModifiedCosineMatrix[i][j] = 0;
                    }
                }
            }

            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
            {
                for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
                {
                    // A row in which nothing passed the threshold is all zeros; dividing by its
                    // zero degree would produce NaN and poison the power iteration below.
                    double normalized = (sentDegree[i] == 0) ? 0 : idfModifiedCosineMatrix[i][j] / sentDegree[i];

                    idfModifiedCosineMatrix[i][j] = (dampingFactor / idfModifiedCosineMatrix.Length) + ((1 - dampingFactor) * normalized);
                }
            }

            Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

            double[] weights = LexRankCommon.powerMethod(idfModifiedCosineMatrix, 0.1);

            for (int i = 0; i < allSents.Count; i++)
            {
                ((Sentence)allSents[i]).weight = weights[i];
            }

            // typeof avoids constructing a throwaway Sentence just to learn its type.
            Sentence[] sents = (Sentence[])allSents.ToArray(typeof(Sentence));

            return SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
        }
        /// <summary>
        /// Form load handler: queries venues, venue spaces, IDFs, switches, patch panels and the
        /// unlinked switch / patch-panel ports, copies each row into the matching in-memory list,
        /// then refreshes the venue combo box and the data grid.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void Form2_Load(object sender, EventArgs e)
        {
            string venueQuery = "SELECT Venue.VenueID, Venue.VenueName FROM [Venue];";
            string venueSpaceQuery = "SELECT VenueSpace.VenueSpaceID, VenueSpace.VenueSpaceName, VenueSpace.VenueID FROM [VenueSpace];";
            string idfQuery = "SELECT IDF.IDFID, IDF.IDFName, IDF.VenueSpaceID FROM [IDF];";
            string ppQuery = "SELECT PatchPanel.PatchPanelID, PatchPanel.PatchPanelName, PatchPanel.IDFID FROM [PatchPanel];";
            string switchQuery = "SELECT Switch.SwitchID, Switch.DNSName, Switch.IDFID FROM [Switch];";
            string ppPortQuery = "SELECT PatchPanelPort.PatchPanelPortID, PatchPanelPort.PatchPanelPortNum, PatchPanelPort.PatchPanelID FROM [PatchPanelPort] WHERE PatchPanelPort.SwitchPortID IS NULL;";
            string switchPortQuery = "SELECT SwitchPort.SwitchPortID, SwitchPort.SwitchPortNum, SwitchPort.SwitchID FROM [SwitchPort] WHERE SwitchPort.PatchPanelPortID IS NULL;";

            try
            {
                // The queries run inside the try so the OleDbException handler below can
                // actually see database failures; the row-copy loops alone only touch memory.
                DataSet venDS = getDataSet(venueQuery);
                DataSet vsDS = getDataSet(venueSpaceQuery);
                DataSet idfDS = getDataSet(idfQuery);
                DataSet swDS = getDataSet(switchQuery);
                DataSet ppDS = getDataSet(ppQuery);
                DataSet swpDS = getDataSet(switchPortQuery);
                DataSet pppDS = getDataSet(ppPortQuery);

                // Columns are read by position with the row indexer; DataRow.ItemArray would
                // copy the whole row for every single field access.
                foreach (DataRow dr in venDS.Tables[0].Rows)
                {
                    Venue venue = new Venue();
                    venue.venueID = (int)dr[0];
                    venue.venueName = dr[1].ToString();
                    venueList.Add(venue);
                }
                foreach (DataRow dr in vsDS.Tables[0].Rows)
                {
                    VenueSpace venueSpace = new VenueSpace();
                    venueSpace.venueSpaceID = (int)dr[0];
                    venueSpace.venueSpaceName = dr[1].ToString();
                    venueSpace.venueID = (int)dr[2];
                    venueSpaceList.Add(venueSpace);
                }
                foreach (DataRow dr in idfDS.Tables[0].Rows)
                {
                    IDF idf = new IDF();
                    idf.idfID = (int)dr[0];
                    idf.idfName = dr[1].ToString();
                    idf.venueSpaceID = (int)dr[2];
                    idfList.Add(idf);
                }
                foreach (DataRow dr in swDS.Tables[0].Rows)
                {
                    Switch sw = new Switch();
                    sw.switchID = (int)dr[0];
                    sw.switchNameDNS = dr[1].ToString();
                    sw.idfID = (int)dr[2];
                    switchList.Add(sw);
                }
                foreach (DataRow dr in ppDS.Tables[0].Rows)
                {
                    PatchPanel pp = new PatchPanel();
                    pp.patchPanelID = (int)dr[0];
                    pp.patchPanelName = dr[1].ToString();
                    pp.idfID = (int)dr[2];
                    PPList.Add(pp);
                }
                foreach (DataRow dr in swpDS.Tables[0].Rows)
                {
                    SwitchPort swp = new SwitchPort();
                    swp.switchPortID = (int)dr[0];
                    swp.switchPortNum = (int)dr[1];
                    swp.switchID = (int)dr[2];
                    switchPortList.Add(swp);
                }
                foreach (DataRow dr in pppDS.Tables[0].Rows)
                {
                    PatchPanelPort ppp = new PatchPanelPort();
                    ppp.patchPanelPortID = (int)dr[0];
                    ppp.patchPanelPortNum = (int)dr[1];
                    ppp.patchPanelID = (int)dr[2];
                    PPPList.Add(ppp);
                }
            }
            catch (OleDbException exp)
            {
                MessageBox.Show("Database Error:" + exp.Message);
            }

            updateVenueBoxList(venueList);
            updateDataGridView();
        }
        /// <summary>
        /// Loads every IDF joined with its venue space and venue, builds a display string
        /// "name, venue space, venue" for each, and returns the entries ordered by venue name.
        /// </summary>
        /// <returns>A new list of IDF entries sorted by <c>venueName</c>.</returns>
        private List<IDF> getIDFList()
        {
            List<IDF> idfs = new List<IDF>();

            string idfQuery = "SELECT IDF.IDFID, IDF.IDFName, VenueSpace.VenueSpaceName, Venue.VenueName FROM [IDF], [VenueSpace], [Venue] WHERE IDF.VenueSpaceID = VenueSpace.VenueSpaceID AND VenueSpace.VenueID = Venue.VenueID;";
            DataSet idfDS = getDataSet(idfQuery);

            foreach (DataRow dr in idfDS.Tables[0].Rows)
            {
                IDF idf = new IDF();
                // NOTE(review): this stores the IDFID column in idfString, which is overwritten
                // with the display text a few lines below - confirm whether an id member was meant.
                idf.idfString = dr.ItemArray.GetValue(0).ToString();
                idf.name = dr.ItemArray.GetValue(1).ToString();
                idf.venueSpaceName = dr.ItemArray.GetValue(2).ToString();
                idf.venueName = dr.ItemArray.GetValue(3).ToString();
                idf.idfString = idf.name + ", " + idf.venueSpaceName + ", " + idf.venueName;
                idfs.Add(idf);
            }

            // OrderBy does not sort in place; it returns a new ordered sequence, so that
            // result has to be what is returned.
            return idfs.OrderBy(i => i.venueName).ToList();
        }
Example #21
0
        /// <summary>
        /// Clusters the given documents into centroids in a single pass.
        /// Each document is turned into a TF-IDF vector (words whose IDF is below
        /// <c>idfThreshold</c> are dropped) and is either merged into the most
        /// similar existing centroid whose similarity exceeds <c>simThreshold</c>,
        /// or becomes the seed of a new centroid.
        /// </summary>
        /// <param name="docs">List of <c>Document</c> instances to cluster.</param>
        /// <param name="idfdb">IDF source used to weight each word of a document.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <c>Centroid</c> objects, each already reduced
        /// through <c>applyKeepWords()</c>.
        /// </returns>
        public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
        {
            ArrayList centroids = new ArrayList();

            foreach (Document doc in docs)
            {
                // Word statistics are generated for this single document only.
                ArrayList currDoc = new ArrayList();
                currDoc.Add(doc);

                DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

                // Build the TF-IDF vector of the document.
                Hashtable docVector = new Hashtable();

                foreach (DictionaryEntry entry in currDocStats.wordsCount)
                {
                    string word  = (string)entry.Key;
                    int    count = (int)entry.Value;

                    double idf = idfdb.get(word);

                    if (idf < this.idfThreshold)
                    {
                        continue;
                    }

                    docVector[word] = ((double)count) * idf;
                }

                // Find the most similar centroid above the similarity threshold.
                // With no centroids yet the loop does not run, so the first document
                // falls through to the "create a new centroid" path below.
                Centroid nearestCentroid = null;
                double   maxSimilarity   = double.MinValue;

                foreach (Centroid centroid in centroids)
                {
                    // NOTE(review): similarity uses the IDF singleton while the vector
                    // weights above use idfdb - confirm both are meant to be the same
                    // instance. Left unchanged to preserve current results.
                    double similarity = sim(IDF.getInstance(), centroid.values, docVector);

                    if (similarity > simThreshold && similarity > maxSimilarity)
                    {
                        maxSimilarity   = similarity;
                        nearestCentroid = centroid;
                    }
                }

                if (nearestCentroid == null)
                {
                    // Every new centroid starts with exactly one document. Previously
                    // only the very first centroid had its count initialised; centroids
                    // created later on this path did not.
                    nearestCentroid = new Centroid(docVector, this.keepWords);
                    nearestCentroid.noOfDocuments = 1;
                    centroids.Add(nearestCentroid);
                }
                else
                {
                    nearestCentroid.addDocument(docVector);
                }
            }

            // Apply the KEEP_WORDS parameter for each centroid.
            foreach (Centroid centroid in centroids)
            {
                centroid.applyKeepWords();
            }

            return(centroids);
        }
        /// <summary>
        /// TF-IDF search over a document collection with Spark.
        /// <c>args[0]</c> is handed to <c>GetDocuments</c> as the document path and
        /// <c>args[1]</c> is the search text. Documents and the search text are
        /// tokenized, hashed and IDF-weighted with the same fitted model; the cosine
        /// similarity of each document to the search text is then displayed in
        /// descending order.
        /// </summary>
        private static void Main(string[] args)
        {
            var sessionBuilder = SparkSession.Builder().AppName("TF-IDF Application");
            var spark          = sessionBuilder.GetOrCreate();

            var documentPath = args[0];
            var search       = args[1];

            // Source documents as (title, content) rows.
            var documentRows   = GetDocuments(documentPath);
            var documentSchema = new StructType(
                new List <StructField>
            {
                new StructField("title", new StringType()),
                new StructField("content", new StringType())
            });
            var documents = spark.CreateDataFrame(documentRows, documentSchema);

            // Feature pipeline stages, shared by the documents and the search text.
            var tokenizer = new Tokenizer().SetInputCol("content").SetOutputCol("words");

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol("rawFeatures")
                            .SetNumFeatures(1000000);

            var idf = new IDF().SetInputCol("rawFeatures").SetOutputCol("features");

            // Documents: words -> raw term frequencies -> fitted IDF model -> features.
            var documentWords = tokenizer.Transform(documents);
            var documentTF    = hashingTF.Transform(documentWords);
            var idfModel      = idf.Fit(documentTF);

            var documentFeatures = idfModel.Transform(documentTF).Select("title", "features");

            var featuresColumn = Col("features");
            var normColumn     = udfCalcNorm(documentFeatures["features"]).Alias("norm");
            var titleColumn    = Col("title");
            var normalizedDocuments = documentFeatures.Select(featuresColumn, normColumn, titleColumn);

            // Search text: a one-row frame pushed through the same stages.
            var searchRows = new List <GenericRow> {
                new GenericRow(new[] { search })
            };
            var searchSchema = new StructType(new[] { new StructField("content", new StringType()) });
            var searchTerm   = spark.CreateDataFrame(searchRows, searchSchema);

            var searchWords = tokenizer.Transform(searchTerm);
            var searchTF    = hashingTF.Transform(searchWords);

            var renamedSearchFeatures = idfModel
                                        .Transform(searchTF)
                                        .WithColumnRenamed("features", "searchTermFeatures");
            var normalizedSearchTerm = renamedSearchFeatures
                                       .WithColumn("searchTermNorm", udfCalcNorm(Column("searchTermFeatures")));

            // Pair every document with the search row and score the pair.
            var pairs = normalizedDocuments.CrossJoin(normalizedSearchTerm);

            var similarity = udfCosineSimilarity(Column("features"), Column("searchTermFeatures"),
                                                 Col("norm"), Col("searchTermNorm"));
            var scored = pairs.WithColumn("similarity", similarity);

            scored
            .OrderBy(Desc("similarity"))
            .Select("title", "similarity")
            .Show(10000, 100);
        }
Example #23
0
        /// <summary>
        /// Runs an update against "../test/Target", checks that the move step
        /// replaced the marker file, calls <c>RecoverFile()</c> and then verifies
        /// that every file under the target directory has the same MD5 as before
        /// the update.
        /// </summary>
        /// <remarks>
        /// The update package is fetched from the URL hard-coded below.
        /// NOTE(review): presumably a local test server must be listening there - confirm.
        /// </remarks>
        public void TestMethodRecoverFile()
        {
            ResetLog(); // Reset the log.

            // Remove every other file of the target directory before the update runs.
            if (Directory.Exists("../test/Target"))
            {
                FileInfo[] fis = new DirectoryInfo("../test/Target").GetFiles("*", SearchOption.AllDirectories);
                for (int i = 0; i < fis.Length; i += 2)
                {
                    File.Delete(fis[i].FullName);
                }
            }
            Thread.Sleep(500);

            // Interference files: the same file name with different content on the
            // source side and on the target side.
            File.WriteAllText(new FileInfo("../Debug/TestFile.txt").FullName, "123456789123456789");

            // The target directory is only conditionally present above, so make sure
            // it exists before writing into it.
            Directory.CreateDirectory("../test/Target");
            File.WriteAllText(new FileInfo("../test/Target/TestFile.txt").FullName, "12345");

            Runing.Increment.Log.Info("UnitTest1.TestMethodRecoverFile():生成xml文件");
            IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

            // Record the MD5 of every target file before the update/backup.
            DirectoryInfo backupBeforeDir = new DirectoryInfo("../test/Target");

            FileInfo[] backupBeforeFiles = backupBeforeDir.GetFiles("*", SearchOption.AllDirectories);
            Dictionary <string, string> backupBeforeMD5 = new Dictionary <string, string>();

            foreach (FileInfo file in backupBeforeFiles)
            {
                backupBeforeMD5.Add(file.FullName, MD5Helper.FileMD5(file.FullName));
            }

            bool isDone = false;
            System.Exception callbackFailure = null;

            IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
            .OnMoveFileDone((obj, success) =>
            {
                try
                {
                    string str = File.ReadAllText(new FileInfo("../test/Target/TestFile.txt").FullName);
                    Assert.IsTrue(str == "123456789123456789");

                    obj.RecoverFile();
                }
                catch (System.Exception ex)
                {
                    // Remember the failure so the test thread can report it, then let
                    // it propagate exactly as before.
                    callbackFailure = ex;
                    throw;
                }
                finally
                {
                    // Always release the waiting test thread; previously a failed
                    // assertion here left the polling loop below spinning forever.
                    isDone = true;
                }
            })
            .OnDownloadSuccess((obj) =>
            {
                // Close those programs
                obj.MoveFile();
            })
            .OnError((e) =>
            {
                isDone = true;
            }).Go();

            while (!isDone)
            {
                Thread.Sleep(50);
            }

            if (callbackFailure != null)
            {
                Assert.Fail("OnMoveFileDone callback failed: " + callbackFailure);
            }

            // Record the MD5 of every target file after recovery and compare.
            Runing.Increment.Log.Info($"UnitTest1.TestMethodRecoverFile(): 开始检查备份前后的文件...");
            DirectoryInfo backupAfterDir = new DirectoryInfo("../test/Target");

            FileInfo[] backupAfterFiles = backupAfterDir.GetFiles("*", SearchOption.AllDirectories);
            Assert.IsTrue(backupBeforeMD5.Count == backupAfterFiles.Length);

            foreach (FileInfo file in backupAfterFiles)
            {
                Assert.IsTrue(backupBeforeMD5[file.FullName] == MD5Helper.FileMD5(file.FullName));
            }

            string str2 = File.ReadAllText(new FileInfo("../test/Target/TestFile.txt").FullName);

            Assert.IsTrue(str2 == "12345");
        }
Example #24
0
        /// <summary>
        /// Computes the weight of a sentence as the sum of four scores, stores it in
        /// <c>sent.weight</c> and returns it:
        /// lead (2 for the first sentence of the document, otherwise 1),
        /// title (2 * tf for every word that also occurs in the document title),
        /// cue (tf for every word found in the cue-word list) and
        /// tf-idf (sum of ((tf - 1) / tf) * idf over the words, divided by the word count).
        /// </summary>
        /// <param name="idf">IDF source queried for each word of the sentence.</param>
        /// <param name="doc">Document that contains the sentence; its title may be null.</param>
        /// <param name="sent">Sentence to score; its <c>weight</c> member is overwritten.</param>
        /// <returns>The computed weight.</returns>
        public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
        {
            Trace.write(sent.fullText);

            // 1: ScLead
            double sclead = (sent == doc.sentences[0]) ? 2 : 1;

            Trace.write("SCLead : " + sclead);

            // 2: ScTitle
            double sctitle = 0;

            // Snapshot the title words once instead of rebuilding the array for
            // every word of the sentence.
            var titleWords = (doc.title != null && doc.title.words != null)
                             ? doc.title.words.ToArray()
                             : null;

            if (titleWords != null)
            {
                foreach (string aWord in sent.words)
                {
                    double tf = termFrequency(sent, aWord);

                    if (titleWords.Contains(aWord))
                    {
                        sctitle += (2 * tf);
                    }
                }
            }

            Trace.write("SCTitle : " + sctitle);

            // 3: sccue
            double sccue = 0;

            // Looked up once; the path argument is the same for every word.
            var cueWords = CueWords.getInstance(Conf.CUE_WORDS_PATH);

            foreach (string aWord in sent.words)
            {
                if (cueWords.contains(aWord))
                {
                    sccue += termFrequency(sent, aWord);
                }
            }

            Trace.write("SCCue : " + sccue);

            // 4: sctfidf
            double sctfidf = 0;

            foreach (string aWord in sent.words)
            {
                double tf = termFrequency(sent, aWord);

                // tf == 0 would divide by zero in the (tf - 1) / tf factor.
                if (tf != 0)
                {
                    sctfidf += (((tf - 1) / tf) * idf.get(aWord));
                }
            }

            // Normalise by the sentence length. An empty sentence keeps a score of 0;
            // dividing 0.0 by 0 would produce NaN and poison the total weight.
            int wordCount = sent.words.Count;

            if (wordCount > 0)
            {
                sctfidf = sctfidf / wordCount;
            }

            Trace.write("SCTFIDF : " + sctfidf);

            double weight = sclead + sctitle + sccue + sctfidf;

            sent.weight = weight;

            Trace.write("Weight : " + weight);

            return(weight);
        }
Example #25
0
        /// <summary>
        /// Generates an IDF from the "*.txt" files found directly inside
        /// <paramref name="docsFolder"/> (sub-folders are not searched) and writes
        /// it to <paramref name="idfFile"/>.
        /// </summary>
        private static void preprocessIDF(string docsFolder, string idfFile)
        {
            string[] textFiles = Directory.GetFiles(docsFolder, "*.txt", SearchOption.TopDirectoryOnly);

            IDF.IDFGenerator.fromFiles(textFiles).toFile(idfFile);
        }
Example #26
0
        /// <summary>
        /// Generates an IDF from the "*.txt" files found directly inside a documents
        /// folder and writes it to a file.
        /// </summary>
        /// <param name="args">
        /// Optional overrides: <c>args[0]</c> is the documents folder and
        /// <c>args[1]</c> is the output file. When omitted, the previously
        /// hard-coded folder and "IDF.txt" are used, so running without arguments
        /// behaves as before.
        /// </param>
        /// <remarks>
        /// An experimental training/summary-generation driver that used to live here
        /// as commented-out code has been removed; it was never executed.
        /// </remarks>
        static void Main(string[] args)
        {
            string docsFolder = (args != null && args.Length > 0)
                                ? args[0]
                                : @"D:\Files\College\Advanced AI\Data Sets\CNNArabic2\Dest2\";
            string idfFile = (args != null && args.Length > 1)
                             ? args[1]
                             : @"IDF.txt";

            IDF idf = IDF.IDFGenerator.fromFiles(Directory.GetFiles(docsFolder, "*.txt", SearchOption.TopDirectoryOnly));

            idf.toFile(idfFile);
        }
Example #27
0
        // POST api/idf
        /// <summary>
        /// Builds an <c>IDF</c> from the posted <c>Computer</c> and runs it.
        /// </summary>
        /// <param name="value">Computer bound from the request body.</param>
        public void Post([FromBody] Computer value)
        {
            new IDF(value).Run();
        }
Example #28
0
        /*
         * public string generateSummary(DocsStatistics docStats, string newDocText)
         * {
         *  Document newDoc = Conf.getDocumentProcessor().process(newDocText);
         *
         *  return (generateSummary(docStats, newDoc));
         * }
         * //*/

        //private static double DEGREE_CENTRALITY = 0.1;

        //override public string generateSummary(DocsStatistics docStats, Document newDoc)
        /// <summary>
        /// Builds a summary from the sentences of all given documents.
        /// Each sentence is weighted by its degree: the number of entries in its row
        /// of the IDF-modified cosine matrix that exceed <c>degreeCentrality</c>.
        /// The weighted sentences are then handed to
        /// <c>SummaryUtil.SummarizeByCompressionRatio</c>.
        /// </summary>
        /// <param name="docs">List of <c>Document</c> instances whose sentences are pooled.</param>
        /// <param name="compressionRatio">Passed through to the summarizer unchanged.</param>
        /// <returns>The summary text produced by the summarizer.</returns>
        override public string generateSummary(ArrayList docs, double compressionRatio)
        {
            // Pool the sentences of every document into one list.
            ArrayList pooledSentences = new ArrayList();

            foreach (Document document in docs)
            {
                pooledSentences.AddRange(document.sentences);
            }

            double[][] cosineMatrix = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), pooledSentences);

            Trace.write(" IDF Cosine Matrix : ");
            Trace.write(MatrixUtil.printMatrix(cosineMatrix));

            // Row r of the matrix belongs to pooled sentence r.
            int row = 0;

            foreach (double[] similarities in cosineMatrix)
            {
                int degree = 0;

                foreach (double similarity in similarities)
                {
                    if (similarity > this.degreeCentrality)
                    {
                        degree += 1;
                    }
                }

                ((Sentence)pooledSentences[row]).weight = degree;
                row++;
            }

            Sentence[] weightedSentences = (Sentence[])pooledSentences.ToArray(typeof(Sentence));

            return SummaryUtil.SummarizeByCompressionRatio(weightedSentences, compressionRatio);
        }
Example #29
0
        /// <summary>
        /// Runs an update whose file move is started asynchronously from the
        /// download-success callback, then reads the manifest XML stored as the
        /// first entry of "../test/IDFTest.zip" and checks that every listed file
        /// under "../test/Target" has the MD5 recorded in the manifest.
        /// </summary>
        /// <remarks>
        /// The update package is fetched from the URL hard-coded below.
        /// NOTE(review): presumably a local test server must be listening there - confirm.
        /// </remarks>
        public void TestMethodMoveFile2()
        {
            ResetLog(); // Reset the log.

            Runing.Increment.Log.Info("UnitTest1.TestMethodMoveFile2():生成xml文件");
            IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

            // Remove every other file of the target directory before the update runs.
            if (Directory.Exists("../test/Target"))
            {
                FileInfo[] fis = new DirectoryInfo("../test/Target").GetFiles("*.*", SearchOption.AllDirectories);
                for (int i = 0; i < fis.Length; i += 2)
                {
                    File.Delete(fis[i].FullName);
                }
            }

            Thread.Sleep(500);

            bool isDone = false;
            System.Exception moveFailure = null;

            IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
            .OnMoveFileDone((obj, success) =>
            {
                Runing.Increment.Log.Info("移动文件成功后回调");
                isDone = true;
            })
            .OnDownloadSuccess((obj) =>
            {
                // Close those programs
                // Move the files asynchronously
                Task.Run(() =>
                {
                    try
                    {
                        obj.MoveFile();
                    }
                    catch (System.Exception ex)
                    {
                        // An exception escaping the task would go unobserved and the
                        // polling loop below would never end; record it and release
                        // the test thread instead.
                        moveFailure = ex;
                        isDone = true;
                    }
                });
            })
            .OnError((e) =>
            {
                isDone = true;
            }).Go();

            while (!isDone)
            {
                Thread.Sleep(50);
            }

            if (moveFailure != null)
            {
                Assert.Fail("MoveFile failed: " + moveFailure);
            }

            var xml = XmlHelper.CreatXml();
            OriginFolder originFolder = new OriginFolder();

            // using blocks release the file handle, the archive and the buffer even
            // when reading the manifest throws.
            using (var fs = File.Open(new FileInfo("../test/IDFTest.zip").FullName, FileMode.Open, FileAccess.Read))
            using (ZipFile zip = ZipFile.Read(fs))
            using (MemoryStream xmlms = new MemoryStream())
            {
                ZipEntry ze = zip.Entries.First(); // The first entry

                ze.Extract(xmlms);
                xmlms.Position = 0;
                xml.Load(xmlms); // Read the xml from the downloaded file stream

                var node = xml.DocumentElement.SelectSingleNode("./" + typeof(OriginFolder).Name);

                originFolder.FromXml(node); // Deserialize from the node under the xml root
            }

            int index = 0;

            foreach (var item in originFolder.fileItemDict.Values)
            {
                index++;
                Runing.Increment.Log.Info($"测试{index}:测试校验文件" + item.relativePath);
                string itemTarFilePath = Path.Combine(new DirectoryInfo("../test/Target").FullName, item.relativePath);
                Assert.IsTrue(MD5Helper.FileMD5(itemTarFilePath) == item.MD5);
            }
        }