// Runs when the user confirms the document-selection dialog: generates the
// IDF table from the selected files, then shows the save dialog.
private void IDFPreprocessDocsDialog_FileOk(object sender, CancelEventArgs e)
{
    // Files picked by the user in the dialog.
    string[] selectedFiles = this.IDFPreprocessDocsDialog.FileNames;

    // The generator reads and processes the files itself.
    this.idf = IDF.IDFGenerator.fromFiles(selectedFiles);

    this.IDFSaveDialog.ShowDialog();
}
// Generates the config/zip for the local test server, starts an update and
// polls until the updater reports success or an error. An error fails the test.
public void TestMethodDownload()
{
    ResetLog(); // reset the log

    Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

    Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
    IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

    bool isDone = false;
    bool isFail = false;

    IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
        .OnDownloadSuccess((obj) =>
        {
            Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
            isDone = true;
        })
        .OnError((e) =>
        {
            Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

            // Only record the failure here. Asserting inside the callback throws
            // before isDone is set, which leaves the polling loop below spinning forever.
            isFail = true;
            isDone = true;
        }).Go();

    while (!isDone)
    {
        Thread.Sleep(15);
    }

    // A download error means the test does not pass.
    Assert.IsFalse(isFail);
}
// Reads every "*.txt" file directly inside clusterDir, turns each into a
// Document, and stores the keep-words-filtered centroid in centroidWords.
private void load(string clusterDir)
{
    DocumentProcessor processor = new DocumentProcessor();
    ArrayList clusterDocs = new ArrayList();

    foreach (string path in Directory.GetFiles(clusterDir, "*.txt", SearchOption.TopDirectoryOnly))
    {
        string rawText = File.ReadAllText(path, Encoding.Default);
        Document parsed = processor.process(rawText);
        clusterDocs.Add(parsed);
    }

    DocsStatistics stats = DocsStatistics.generateStatistics(clusterDocs);

    // Centroid weight of a word = (its count * its IDF) / number of documents.
    Hashtable weights = new Hashtable();
    foreach (string term in stats.wordsCount.Keys)
    {
        int occurrences = (int)stats.wordsCount[term];
        weights[term] = (occurrences * IDF.getInstance().get(term)) / clusterDocs.Count;
    }

    this.centroidWords = applyKeepWords(weights, this.keepWords);
}
// Form start-up: installs the trace sink and the IDF instance, then selects
// the first entry of the algorithm combo box.
private void MainForm_Load(object sender, EventArgs e)
{
    // Trace output goes through a TextBoxTrace built on the form's TraceTxt control.
    var traceSink = new TextBoxTrace(this.TraceTxt);
    Trace.setInstance(traceSink);

    // Install the IDF read from the configured file as the shared instance.
    var loadedIdf = IDF.fromFile(Conf.IDFFILE_PATH);
    IDF.setInstance(loadedIdf);

    this.AlgorithmCmbo.SelectedIndex = 0;
}
// Same scenario as the synchronous download test, but the update is started
// from a Task. The test thread polls until a callback fires or the task faults.
public void TestMethodDownloadAsync()
{
    ResetLog(); // reset the log

    bool isDone = false;
    bool isFail = false;

    // Started from a task: the callbacks do not hop back to the original thread either.
    Task worker = Task.Run(() =>
    {
        Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

        Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
        IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

        IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
            .OnDownloadSuccess((obj) =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
                isDone = true;
            })
            .OnError((e) =>
            {
                Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

                // Publish the failure before the completion flag so the waiting
                // thread cannot observe isDone without also observing isFail.
                isFail = true;
                isDone = true;
            }).Go();
    });

    // If the task body itself throws, no callback will ever set isDone;
    // stop waiting in that case instead of looping forever.
    while (!isDone && !worker.IsFaulted)
    {
        Thread.Sleep(15);
    }

    Assert.IsFalse(worker.IsFaulted);
    Assert.IsFalse(isFail);
}
// Click handler that kicks off the same update as the download test from the
// UI thread. It only logs which thread each callback runs on and does not wait.
private void button1_Click(object sender, EventArgs ea)
{
    ResetLog(); // reset the log

    Runing.Increment.Log.Info("TestMethodDownload():当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);

    Console.WriteLine("UnitTestXML.GenerateXML():生成xml文件");
    IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

    var update = IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup");

    update
        .OnDownloadSuccess((updater) =>
        {
            Runing.Increment.Log.Info("TestMethodDownload():进入OnDownloadSuccess当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
        })
        .OnError((error) =>
        {
            Runing.Increment.Log.Info("TestMethodDownload():进入OnError当前执行线程id=" + Thread.CurrentThread.ManagedThreadId);
        })
        .Go();
}
// Fits an IDF estimator on a one-row tokenized/hashed DataFrame and checks the
// resulting IDFModel: output column, getters, save/load round trip and the
// shared feature-base checks.
public void TestIDFModel()
{
    string expectedInputCol = "rawFeatures";
    string expectedOutputCol = "features";
    int expectedDocFrequency = 1980;

    // sentence -> words -> hashed term frequencies
    DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    var tokenizer = new Tokenizer();
    tokenizer.SetInputCol("sentence").SetOutputCol("words");
    DataFrame wordsData = tokenizer.Transform(sentenceData);

    var hashingTF = new HashingTF();
    hashingTF.SetInputCol("words").SetOutputCol(expectedInputCol).SetNumFeatures(20);
    DataFrame featurizedData = hashingTF.Transform(wordsData);

    IDFModel idfModel = new IDF()
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol)
        .SetMinDocFreq(expectedDocFrequency)
        .Fit(featurizedData);

    DataFrame rescaledData = idfModel.Transform(featurizedData);

    Assert.Contains(expectedOutputCol, rescaledData.Columns());
    Assert.Equal(expectedInputCol, idfModel.GetInputCol());
    Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
    Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

    // Save/load round trip must preserve the model's uid.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string modelPath = Path.Join(tempDirectory.Path, "idfModel");
        idfModel.Save(modelPath);

        IDFModel reloaded = IDFModel.Load(modelPath);
        Assert.Equal(idfModel.Uid(), reloaded.Uid());
    }

    TestFeatureBase(idfModel, "minDocFreq", 1000);
}
// Runs the updater against three addresses in turn and, for each one, waits
// until either the move-file callback or the error callback has fired.
public void TestMethodNoServer()
{
    ResetLog(false); // reset the log

    // { message written to the log, URL handed to the updater }
    string[][] rounds =
    {
        new[] { "UnitTest1.TestMethodNoServer(): http://127.0.0.1:11122", "http://127.0.0.1:11122/test/IDFTest.zip" },
        new[] { "UnitTest1.TestMethodNoServer(): http://baidu.com/IDFTest.zip", "http://baidu.com/IDFTest.zip" },
        new[] { "UnitTest1.TestMethodNoServer(): http://www.google.com", "http://www.google.com/IDFTest.zip" },
    };

    // One flag shared by all rounds and reset after each.
    bool isDone = false;

    foreach (string[] round in rounds)
    {
        Runing.Increment.Log.Info(round[0]);

        IDF.Update(round[1], "../test/Temp", "../test/Target", "../test/Backup")
            .OnMoveFileDone((updater, success) => { isDone = true; })
            .OnDownloadSuccess((updater) => { updater.MoveFile(); })
            .OnError((error) => { isDone = true; })
            .Go();

        while (!isDone)
        {
            Thread.Sleep(50);
        }

        isDone = false;
    }
}
// Tokenizes and hashes the source documents under sourceDir, fits an IDF model
// on them, and returns the model together with a ("Path", "features", "norm")
// DataFrame where "norm" is computed by udfCalcNorm over "features".
private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(string sourceDir, Tokenizer tokenizer, HashingTF hashingTF)
{
    // documents -> words -> raw term-frequency features
    var featurized = hashingTF.Transform(tokenizer.Transform(toDF(GetSourceFiles(sourceDir))));

    var model = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features")
        .Fit(featurized);

    var normalized = model.Transform(featurized)
        .Select("Path", "features")
        .WithColumn("norm", udfCalcNorm(Col("features")));

    return (model, normalized);
}
// IDF-modified cosine similarity between two sentences:
//   sum over shared words of tf1 * tf2 * idf^2, divided by the product of the
//   two sentences' tf*idf vector norms.
// Returns 0 when either norm is 0 (e.g. an empty sentence); the original
// divided 0 by 0 there and returned NaN.
public static double idfModifiedCos(IDF idf, Sentence firstSentence, Sentence secondSentence)
{
    // Words that occur in both sentences (each counted once).
    HashSet<string> commonWords = new HashSet<string>();
    foreach (string aWord in firstSentence.words)
    {
        if (secondSentence.words.Contains(aWord))
        {
            commonWords.Add(aWord);
        }
    }

    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += termFrequency(firstSentence, aWord) * termFrequency(secondSentence, aWord) * Math.Pow(idf.get(aWord), 2);
    }

    double denominator = idfWeightedNorm(idf, firstSentence) * idfWeightedNorm(idf, secondSentence);
    if (denominator == 0)
    {
        // Nothing to compare against: treat as "no similarity" instead of NaN.
        return 0;
    }

    return numerator / denominator;
}

// Square root of the sum of (tf * idf)^2 over every word occurrence of the sentence.
private static double idfWeightedNorm(IDF idf, Sentence sentence)
{
    double sumOfSquares = 0;
    foreach (string aWord in sentence.words)
    {
        sumOfSquares += Math.Pow(termFrequency(sentence, aWord) * idf.get(aWord), 2);
    }

    return Math.Sqrt(sumOfSquares);
}
// Weights every sentence of newDoc with calcSentenceWeight and lets
// SummaryUtil pick the summary for the requested compression ratio.
public override string generateSummary(Document newDoc, double compressionRatio)
{
    foreach (Sentence sentence in newDoc.sentences)
    {
        calcSentenceWeight(IDF.getInstance(), newDoc, sentence);
    }

    Sentence[] weighted = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));

    return SummaryUtil.SummarizeByCompressionRatio(weighted, compressionRatio);
}
// Prints the m tokens with the highest inverse document frequency, one per line.
// contexts: space-separated token strings; each context counts as one document.
// m:        number of tokens to print.
// The "-TITLE-" marker token is ignored. Ties on the IDF value are broken by
// ordinal token order.
public void extract_trigger_words(string[] contexts, int m)
{
    int N = contexts.Length;

    // Keyed by token so each lookup is O(1) instead of a scan over all tokens seen so far.
    Dictionary<string, IDF> idfByToken = new Dictionary<string, IDF>();

    foreach (string context in contexts)
    {
        string[] tokens = context.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // Distinct(): a token counts at most once per context (document frequency).
        foreach (string token in tokens.Where(t => t != "-TITLE-").Distinct())
        {
            IDF known;
            if (idfByToken.TryGetValue(token, out known))
            {
                known.Counted++;
            }
            else
            {
                idfByToken.Add(token, new IDF() { Name = token, Counted = 1, Value = 0 });
            }
        }
    }

    foreach (IDF idf in idfByToken.Values)
    {
        // Force floating-point division: with two integers, N / Counted is
        // truncated before the logarithm is taken.
        idf.Value = Math.Log((double)N / idf.Counted);
    }

    foreach (IDF idf in idfByToken.Values.OrderByDescending(i => i.Value).ThenBy(i => i.Name, StringComparer.Ordinal).Take(m))
    {
        System.Console.WriteLine(idf.Name);
    }
}
// Two-document corpus: the first and third words of doc1 occur in both
// documents, its fourth word only in doc1. Checks Evaluate against
// log10(documentCount / documentsContainingWord).
public void IDFTest()
{
    var first = new Document(new List<string> { "АААА", "АААА", "ББББ", "ЗАС" });
    var second = new Document(new List<string> { "АААА", "АААА", "ББББ", "ББББ", "ББББ" });

    var corpus = new Dictionary<string, Document>
    {
        { "1", first },
        { "2", second }
    };

    IDF evaluator = new IDF(corpus);

    double inBothDocs = Math.Log10((double)corpus.Count / 2);
    double inOneDoc = Math.Log10((double)corpus.Count / 1);

    Assert.AreEqual(inBothDocs, evaluator.Evaluate(first[0]), eps);
    Assert.AreEqual(inBothDocs, evaluator.Evaluate(first[2]), eps);
    Assert.AreEqual(inOneDoc, evaluator.Evaluate(first[3]), eps);
}
// Builds the symmetric sentence-by-sentence similarity matrix.
// The diagonal is 1; entry [i][j] (i != j) is idfModifiedCos of sentences i and j.
//
// Only the upper triangle is computed and then mirrored. The original used
// "value != 0" as an "already processed" marker, which recomputed every pair
// whose similarity is genuinely 0.
public static double[][] generateIdfModifiedCosineMatrix(IDF idf, ArrayList sentences)
{
    int count = sentences.Count;

    double[][] idfModifiedCosine = new double[count][];
    for (int i = 0; i < count; i++)
    {
        idfModifiedCosine[i] = new double[count];
        idfModifiedCosine[i][i] = 1; // same sentence
    }

    for (int i = 0; i < count; i++)
    {
        Sentence firstSent = (Sentence)sentences[i];

        for (int j = i + 1; j < count; j++)
        {
            Sentence secondSent = (Sentence)sentences[j];

            double similarity = idfModifiedCos(idf, firstSent, secondSent);
            idfModifiedCosine[i][j] = similarity;
            idfModifiedCosine[j][i] = similarity;
        }
    }

    return idfModifiedCosine;
}
// Similarity between two word -> weight vectors (values are boxed doubles):
//   sum over shared words of first[w] * second[w] * idf(w), divided by the
//   product of the two vectors' Euclidean norms.
// Returns 0 when either vector has zero norm (e.g. is empty); the original
// computed 0 / 0 and returned NaN in that case.
public static double sim(IDF idf, Hashtable first, Hashtable second)
{
    HashSet<string> commonWords = SummaryUtil.getCommonWords(new ArrayList(first.Keys), new ArrayList(second.Keys));

    double numerator = 0;
    foreach (string aWord in commonWords)
    {
        numerator += (double)first[aWord] * (double)second[aWord] * idf.get(aWord);
    }

    double denominator = euclideanNorm(first) * euclideanNorm(second);
    if (denominator == 0)
    {
        return 0;
    }

    return numerator / denominator;
}

// Square root of the sum of squared values of a word -> double table.
private static double euclideanNorm(Hashtable vector)
{
    double sumOfSquares = 0;
    foreach (object weight in vector.Values)
    {
        sumOfSquares += Math.Pow((double)weight, 2);
    }

    return Math.Sqrt(sumOfSquares);
}
// Checks the IDF estimator's column/min-doc-frequency getters and that a
// save/load round trip preserves its uid.
public void TestIDFModel()
{
    const string expectedInputCol = "rawFeatures";
    const string expectedOutputCol = "features";
    const int expectedDocFrequency = 100;

    var idf = new IDF();
    idf.SetInputCol(expectedInputCol)
       .SetOutputCol(expectedOutputCol)
       .SetMinDocFreq(expectedDocFrequency);

    Assert.Equal(expectedInputCol, idf.GetInputCol());
    Assert.Equal(expectedOutputCol, idf.GetOutputCol());
    Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq());

    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "IDF");
        idf.Save(savePath);

        IDF reloaded = IDF.Load(savePath);
        Assert.Equal(idf.Uid(), reloaded.Uid());
    }
}
// Centroid-based summary of newDoc.
//
// Each sentence gets three scores:
//   C - sum of the centroid values of its words,
//   P - positional score, decreasing linearly from cMax for the first sentence,
//   F - overlap of its words with the title and (if the document has more
//       than one sentence) the first sentence.
// The weighted sum becomes the sentence weight. Redundancy penalties are then
// applied repeatedly until the generated summary stops changing.
//
// Changes from the original:
//   * the compressionRatio argument is honoured (the original ignored it and
//     always used this.compressionRatio);
//   * a document without a title no longer causes a NullReferenceException;
//   * typeof(Sentence) replaces new Sentence().GetType().
override public string generateSummary(Document newDoc, double compressionRatio)
{
    int sentenceCount = newDoc.sentences.Count;

    double[] cTotal = new double[sentenceCount];
    double[] pTotal = new double[sentenceCount];
    double[] fTotal = new double[sentenceCount];
    double cMax = double.MinValue;

    ArrayList centroids = buildCentroids(this.trainingDocs, IDF.getInstance());

    // The first sentence only contributes to F when there is more than one sentence.
    Sentence leadSentence = sentenceCount > 1 ? (Sentence)newDoc.sentences[0] : null;

    for (int i = 0; i < sentenceCount; i++)
    {
        Sentence currSent = (Sentence)newDoc.sentences[i];

        // Calculate C
        cTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            cTotal[i] += getCentroidValue(centroids, word);
        }

        if (cTotal[i] > cMax)
        {
            cMax = cTotal[i];
        }

        // Calculate F
        fTotal[i] = 0;
        foreach (string word in currSent.words)
        {
            int wordOccurence = 0;

            if (newDoc.title != null && newDoc.title.wordsCount[word] != null)
            {
                wordOccurence += (int)newDoc.title.wordsCount[word];
            }

            if (leadSentence != null && leadSentence.wordsCount[word] != null)
            {
                wordOccurence += (int)leadSentence.wordsCount[word];
            }

            fTotal[i] += wordOccurence * ((int)currSent.wordsCount[word]);
        }
    }

    // Calculate P (no "+ 1": indices are zero based, so sentence 0 gets cMax).
    for (int i = 0; i < sentenceCount; i++)
    {
        pTotal[i] = ((sentenceCount - i) * cMax) / sentenceCount;
    }

    double maxScore = double.MinValue;
    for (int i = 0; i < sentenceCount; i++)
    {
        double currWeight = (this.centroidWeight * cTotal[i])
                          + (this.positionalWeight * pTotal[i])
                          + (this.firstSentenceWeight * fTotal[i]);

        ((Sentence)newDoc.sentences[i]).weight = currWeight;

        if (currWeight > maxScore)
        {
            maxScore = currWeight;
        }
    }

    string genSummary = null;
    string prevgenSummary = null;

    do
    {
        // Penalise each later sentence for redundancy with every earlier one.
        for (int i = 0; i < sentenceCount; i++)
        {
            for (int j = i + 1; j < sentenceCount; j++)
            {
                double redundancy = redundancyPenalty((Sentence)newDoc.sentences[i], (Sentence)newDoc.sentences[j]);
                ((Sentence)newDoc.sentences[j]).weight -= (maxScore * redundancy);
            }
        }

        // Re-derive the maximum weight for the next round.
        maxScore = double.MinValue;
        for (int i = 0; i < sentenceCount; i++)
        {
            double weight = ((Sentence)newDoc.sentences[i]).weight;
            if (weight > maxScore)
            {
                maxScore = weight;
            }
        }

        Sentence[] sents = (Sentence[])newDoc.sentences.ToArray(typeof(Sentence));

        prevgenSummary = genSummary;
        genSummary = SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
    } while (!genSummary.Equals(prevgenSummary));

    return genSummary;
}
// LexRank-with-threshold summary over all sentences of docs.
//
// Steps: build the IDF-modified cosine matrix, binarise it against
// this.threshold while counting each row's degree, normalise each row by its
// degree and apply the damping factor, run the power method, and use the
// resulting vector as sentence weights.
//
// Changes from the original:
//   * a row whose degree is 0 is no longer divided by 0 (which filled the row with NaN);
//   * the redundant zeroing loop over a freshly allocated array is gone;
//   * typeof(Sentence) replaces new Sentence().GetType().
override public string generateSummary(ArrayList docs, double compressionRatio)
{
    ArrayList allSents = new ArrayList();
    foreach (Document doc in docs)
    {
        allSents.AddRange(doc.sentences);
    }

    double[][] idfModifiedCosineMatrix = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), allSents);

    Trace.write(" IDF Cosine Matrix : ");
    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    // new double[n] is already zero-filled.
    double[] sentDegree = new double[allSents.Count];

    // Binarise: 1 where similarity exceeds the threshold, 0 elsewhere.
    for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
    {
        for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
        {
            if (idfModifiedCosineMatrix[i][j] > this.threshold)
            {
                idfModifiedCosineMatrix[i][j] = 1;
                sentDegree[i]++;
            }
            else
            {
                idfModifiedCosineMatrix[i][j] = 0;
            }
        }
    }

    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    // Row-normalise by degree, then mix with the uniform damping term.
    for (int i = 0; i < idfModifiedCosineMatrix.Length; i++)
    {
        for (int j = 0; j < idfModifiedCosineMatrix[i].Length; j++)
        {
            // A degree-0 row is all zeros already; dividing would produce 0/0 = NaN.
            if (sentDegree[i] != 0)
            {
                idfModifiedCosineMatrix[i][j] = idfModifiedCosineMatrix[i][j] / sentDegree[i];
            }

            idfModifiedCosineMatrix[i][j] = (dampingFactor / idfModifiedCosineMatrix.Length)
                                          + ((1 - dampingFactor) * idfModifiedCosineMatrix[i][j]);
        }
    }

    Trace.write(MatrixUtil.printMatrix(idfModifiedCosineMatrix));

    double[] weights = LexRankCommon.powerMethod(idfModifiedCosineMatrix, 0.1);

    for (int i = 0; i < allSents.Count; i++)
    {
        ((Sentence)allSents[i]).weight = weights[i];
    }

    Sentence[] sents = (Sentence[])allSents.ToArray(typeof(Sentence));

    return SummaryUtil.SummarizeByCompressionRatio(sents, compressionRatio);
}
// Form load: reads venues, venue spaces, IDFs, switches, patch panels and the
// still-unconnected switch / patch-panel ports from the database into the
// form's lists, then refreshes the venue list and the data grid.
//
// Changes from the original:
//   * the getDataSet calls now run inside the try block. Before, the
//     OleDbException handler only wrapped the in-memory row copying, so the
//     database calls themselves were not covered by it;
//   * fields are read through the DataRow indexer instead of
//     dr.ItemArray.GetValue(n), which copies the whole row on every access.
private void Form2_Load(object sender, EventArgs e)
{
    string venueQuery = "SELECT Venue.VenueID, Venue.VenueName FROM [Venue];";
    string venueSpaceQuery = "SELECT VenueSpace.VenueSpaceID, VenueSpace.VenueSpaceName, VenueSpace.VenueID FROM [VenueSpace];";
    string idfQuery = "SELECT IDF.IDFID, IDF.IDFName, IDF.VenueSpaceID FROM [IDF];";
    string ppQuery = "SELECT PatchPanel.PatchPanelID, PatchPanel.PatchPanelName, PatchPanel.IDFID FROM [PatchPanel];";
    string switchQuery = "SELECT Switch.SwitchID, Switch.DNSName, Switch.IDFID FROM [Switch];";
    string ppPortQuery = "SELECT PatchPanelPort.PatchPanelPortID, PatchPanelPort.PatchPanelPortNum, PatchPanelPort.PatchPanelID FROM [PatchPanelPort] WHERE PatchPanelPort.SwitchPortID IS NULL;";
    string switchPortQuery = "SELECT SwitchPort.SwitchPortID, SwitchPort.SwitchPortNum, SwitchPort.SwitchID FROM [SwitchPort] WHERE SwitchPort.PatchPanelPortID IS NULL;";

    try
    {
        // Run all queries first, in the original order, before filling any list.
        DataSet venDS = getDataSet(venueQuery);
        DataSet vsDS = getDataSet(venueSpaceQuery);
        DataSet idfDS = getDataSet(idfQuery);
        DataSet swDS = getDataSet(switchQuery);
        DataSet ppDS = getDataSet(ppQuery);
        DataSet swpDS = getDataSet(switchPortQuery);
        DataSet pppDS = getDataSet(ppPortQuery);

        foreach (DataRow dr in venDS.Tables[0].Rows)
        {
            Venue venue = new Venue();
            venue.venueID = (int)dr[0];
            venue.venueName = dr[1].ToString();
            venueList.Add(venue);
        }

        foreach (DataRow dr in vsDS.Tables[0].Rows)
        {
            VenueSpace venueSpace = new VenueSpace();
            venueSpace.venueSpaceID = (int)dr[0];
            venueSpace.venueSpaceName = dr[1].ToString();
            venueSpace.venueID = (int)dr[2];
            venueSpaceList.Add(venueSpace);
        }

        foreach (DataRow dr in idfDS.Tables[0].Rows)
        {
            IDF idf = new IDF();
            idf.idfID = (int)dr[0];
            idf.idfName = dr[1].ToString();
            idf.venueSpaceID = (int)dr[2];
            idfList.Add(idf);
        }

        foreach (DataRow dr in swDS.Tables[0].Rows)
        {
            Switch sw = new Switch();
            sw.switchID = (int)dr[0];
            sw.switchNameDNS = dr[1].ToString();
            sw.idfID = (int)dr[2];
            switchList.Add(sw);
        }

        foreach (DataRow dr in ppDS.Tables[0].Rows)
        {
            PatchPanel pp = new PatchPanel();
            pp.patchPanelID = (int)dr[0];
            pp.patchPanelName = dr[1].ToString();
            pp.idfID = (int)dr[2];
            PPList.Add(pp);
        }

        foreach (DataRow dr in swpDS.Tables[0].Rows)
        {
            SwitchPort swp = new SwitchPort();
            swp.switchPortID = (int)dr[0];
            swp.switchPortNum = (int)dr[1];
            swp.switchID = (int)dr[2];
            switchPortList.Add(swp);
        }

        foreach (DataRow dr in pppDS.Tables[0].Rows)
        {
            PatchPanelPort ppp = new PatchPanelPort();
            ppp.patchPanelPortID = (int)dr[0];
            ppp.patchPanelPortNum = (int)dr[1];
            ppp.patchPanelID = (int)dr[2];
            PPPList.Add(ppp);
        }
    }
    catch (OleDbException exp)
    {
        MessageBox.Show("Database Error:" + exp.Message);
    }

    // Refresh the UI with whatever was loaded.
    updateVenueBoxList(venueList);
    updateDataGridView();
}
// Loads every IDF joined with its venue space and venue, and returns the list
// ordered by venue name. Each item's idfString is "name, venueSpaceName, venueName".
//
// Fix: Enumerable.OrderBy returns a new ordered sequence and does not sort the
// list in place; the original discarded that result and returned the rows unsorted.
private List<IDF> getIDFList()
{
    List<IDF> idfs = new List<IDF>();

    string idfQuery = "SELECT IDF.IDFID, IDF.IDFName, VenueSpace.VenueSpaceName, Venue.VenueName FROM [IDF], [VenueSpace], [Venue] WHERE IDF.VenueSpaceID = VenueSpace.VenueSpaceID AND VenueSpace.VenueID = Venue.VenueID;";
    DataSet idfDS = getDataSet(idfQuery);

    foreach (DataRow dr in idfDS.Tables[0].Rows)
    {
        IDF idf = new IDF();

        // NOTE(review): column 0 (IDFID) is stored in idfString and then
        // overwritten by the display string below - presumably it was meant
        // for an id field; confirm against the IDF class.
        idf.idfString = dr[0].ToString();
        idf.name = dr[1].ToString();
        idf.venueSpaceName = dr[2].ToString();
        idf.venueName = dr[3].ToString();
        idf.idfString = idf.name + ", " + idf.venueSpaceName + ", " + idf.venueName;

        idfs.Add(idf);
    }

    return idfs.OrderBy(i => i.venueName).ToList();
}
// Clusters docs into centroids.
//
// For each document a tf*idf vector is built from its word counts (words whose
// IDF is below idfThreshold are skipped). The document joins the most similar
// existing centroid whose similarity exceeds simThreshold; otherwise it starts
// a new centroid. Finally applyKeepWords() is called on every centroid.
//
// Changes from the original:
//   * similarity uses the idfdb argument instead of IDF.getInstance(), so the
//     whole method works on the IDF table the caller passed in;
//   * every newly created centroid gets noOfDocuments = 1. The original did
//     this only for the very first centroid.
//     NOTE(review): assumes the Centroid constructor does not already count
//     its seed document differently - confirm.
public ArrayList buildCentroids(ArrayList docs, IDF idfdb)
{
    ArrayList centroids = new ArrayList();

    foreach (Document doc in docs)
    {
        // Statistics for this single document.
        ArrayList currDoc = new ArrayList();
        currDoc.Add(doc);
        DocsStatistics currDocStats = DocsStatistics.generateStatistics(currDoc);

        Hashtable docVector = new Hashtable();
        foreach (DictionaryEntry entry in currDocStats.wordsCount)
        {
            string word = (string)entry.Key;
            int count = (int)entry.Value;

            double idf = idfdb.get(word);
            if (idf < this.idfThreshold)
            {
                continue;
            }

            docVector[word] = ((double)count) * idf;
        }

        // Find the closest centroid above the similarity threshold, if any.
        // With no centroids yet the loop is skipped and a new one is created below.
        Centroid nearestCentroid = null;
        double maxSimilarity = double.MinValue;

        foreach (Centroid centroid in centroids)
        {
            double similarity = sim(idfdb, centroid.values, docVector);
            if (similarity > simThreshold && similarity > maxSimilarity)
            {
                maxSimilarity = similarity;
                nearestCentroid = centroid;
            }
        }

        if (nearestCentroid == null)
        {
            Centroid created = new Centroid(docVector, this.keepWords);
            created.noOfDocuments = 1;
            centroids.Add(created);
        }
        else
        {
            nearestCentroid.addDocument(docVector);
        }
    }

    // Apply the KEEP_WORDS parameter for each centroid.
    foreach (Centroid centroid in centroids)
    {
        centroid.applyKeepWords();
    }

    return centroids;
}
// TF-IDF search sample.
// args[0]: path handed to GetDocuments; args[1]: the search text.
// Builds TF-IDF features for all documents and for the search text, cross-joins
// them, and shows documents ordered by descending cosine similarity.
//
// Fix: missing arguments now print a usage line and set a non-zero exit code
// instead of failing with IndexOutOfRangeException.
private static void Main(string[] args)
{
    if (args == null || args.Length < 2)
    {
        System.Console.Error.WriteLine("Usage: <documentPath> <searchTerm>");
        System.Environment.ExitCode = 1;
        return;
    }

    var documentPath = args[0];
    var search = args[1];

    var spark = SparkSession
        .Builder()
        .AppName("TF-IDF Application")
        .GetOrCreate();

    var documentData = GetDocuments(documentPath);

    var documents = spark.CreateDataFrame(documentData, new StructType(
        new List<StructField>
        {
            new StructField("title", new StringType()),
            new StructField("content", new StringType())
        }));

    var tokenizer = new Tokenizer()
        .SetInputCol("content")
        .SetOutputCol("words");

    var hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(1000000);

    var idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");

    // Document side: tokenize -> hash -> fit IDF -> transform -> add norm.
    var tokenizedDocuments = tokenizer.Transform(documents);
    var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);

    var idfModel = idf.Fit(featurizedDocuments);

    var transformedDocuments = idfModel.Transform(featurizedDocuments).Select("title", "features");
    var normalizedDocuments = transformedDocuments.Select(
        Col("features"),
        udfCalcNorm(transformedDocuments["features"]).Alias("norm"),
        Col("title"));

    // Search side: the same tokenizer, hasher and fitted model on a one-row frame.
    var searchTerm = spark.CreateDataFrame(
        new List<GenericRow> { new GenericRow(new[] { search }) },
        new StructType(new[] { new StructField("content", new StringType()) }));

    var tokenizedSearchTerm = tokenizer.Transform(searchTerm);
    var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);

    var normalizedSearchTerm = idfModel
        .Transform(featurizedSearchTerm)
        .WithColumnRenamed("features", "searchTermFeatures")
        .WithColumn("searchTermNorm", udfCalcNorm(Column("searchTermFeatures")));

    var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);

    results
        .WithColumn("similarity", udfCosineSimilarity(Column("features"), Column("searchTermFeatures"), Col("norm"), Col("searchTermNorm")))
        .OrderBy(Desc("similarity")).Select("title", "similarity")
        .Show(10000, 100);
}
// Update-then-recover round trip:
//   1. delete every other file under ../test/Target and plant a "TestFile.txt"
//      whose content differs between source (../Debug) and target;
//   2. record the MD5 of every target file;
//   3. run the update, check that the moved TestFile.txt has the source
//      content, then call RecoverFile();
//   4. verify the target directory matches the recorded MD5s again.
//
// Changes from the original:
//   * the content check is no longer asserted inside the callback. A failing
//     assert there threw before isDone was set and left the wait loop spinning
//     forever; the content is now captured in the callback and asserted afterwards;
//   * the StreamWriters are disposed via using, also when Write throws.
public void TestMethodRecoverFile()
{
    ResetLog(); // reset the log

    if (Directory.Exists("../test/Target"))
    {
        FileInfo[] fis = new DirectoryInfo("../test/Target").GetFiles("*", SearchOption.AllDirectories);
        for (int i = 0; i < fis.Length; i++)
        {
            if (i % 2 == 0)
            {
                File.Delete(fis[i].FullName);
            }
        }
    }

    Thread.Sleep(500);

    // Interfering file: same name, different content in source and target.
    using (var ws = File.CreateText(new FileInfo("../Debug/TestFile.txt").FullName))
    {
        ws.Write("123456789123456789");
    }

    using (var ws = File.CreateText(new FileInfo("../test/Target/TestFile.txt").FullName))
    {
        ws.Write("12345");
    }

    Runing.Increment.Log.Info("UnitTest1.TestMethodRecoverFile():生成xml文件");
    IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

    // Record the MD5 of every target file before the update/backup.
    DirectoryInfo backupBeforeDir = new DirectoryInfo("../test/Target");
    FileInfo[] backupBeforeFiles = backupBeforeDir.GetFiles("*", SearchOption.AllDirectories);
    Dictionary<string, string> backupBeforeMD5 = new Dictionary<string, string>();
    for (int i = 0; i < backupBeforeFiles.Length; i++)
    {
        FileInfo file = backupBeforeFiles[i];
        backupBeforeMD5.Add(file.FullName, MD5Helper.FileMD5(file.FullName));
    }

    bool isDone = false;
    bool moveCallbackRan = false;
    string movedContent = null;

    IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
        .OnMoveFileDone((obj, success) =>
        {
            try
            {
                moveCallbackRan = true;
                movedContent = File.ReadAllText(new FileInfo("../test/Target/TestFile.txt").FullName);
                obj.RecoverFile();
            }
            finally
            {
                // Always release the waiting test thread.
                isDone = true;
            }
        })
        .OnDownloadSuccess((obj) =>
        {
            // (close the affected programs here)
            obj.MoveFile();
        })
        .OnError((e) =>
        {
            isDone = true;
        }).Go();

    while (!isDone)
    {
        Thread.Sleep(50);
    }

    // After the move, the target file must carry the source content.
    if (moveCallbackRan)
    {
        Assert.IsTrue(movedContent == "123456789123456789");
    }

    // Compare the target files with the MD5s recorded before the update.
    Runing.Increment.Log.Info($"UnitTest1.TestMethodRecoverFile(): 开始检查备份前后的文件...");
    DirectoryInfo backupAfterDir = new DirectoryInfo("../test/Target");
    FileInfo[] backupAfterFiles = backupAfterDir.GetFiles("*", SearchOption.AllDirectories);
    Assert.IsTrue(backupBeforeMD5.Count == backupAfterFiles.Length);
    for (int i = 0; i < backupAfterFiles.Length; i++)
    {
        FileInfo file = backupAfterFiles[i];
        Assert.IsTrue(backupBeforeMD5[file.FullName] == MD5Helper.FileMD5(file.FullName));
    }

    string str2 = File.ReadAllText(new FileInfo("../test/Target/TestFile.txt").FullName);
    Assert.IsTrue(str2 == "12345");
}
// Computes, stores in sent.weight, and returns the weight of a sentence as the
// sum of four scores:
//   ScLead  - 2 if sent is the document's first sentence, otherwise 1;
//   ScTitle - 2 * tf for every word that also occurs in the title;
//   ScCue   - tf for every word contained in the cue-word list;
//   ScTfIdf - sum of ((tf - 1) / tf) * idf over the words with tf != 0,
//             divided by the sentence's word count.
// Each partial score is written to the trace.
//
// Changes from the original:
//   * a sentence without words no longer yields NaN (0 / 0) for ScTfIdf;
//   * the title word array and the cue-word list are fetched once per call
//     instead of once per word.
public static double calcSentenceWeight(IDF idf, Document doc, Sentence sent)
{
    Trace.write(sent.fullText);

    // 1: ScLead
    double sclead = (sent == doc.sentences[0]) ? 2 : 1;
    Trace.write("SCLead : " + sclead);

    // 2: ScTitle
    double sctitle = 0;
    var titleWords = doc.title != null ? doc.title.words.ToArray() : null;
    if (titleWords != null)
    {
        foreach (string aWord in sent.words)
        {
            if (titleWords.Contains(aWord))
            {
                sctitle += 2 * termFrequency(sent, aWord);
            }
        }
    }
    Trace.write("SCTitle : " + sctitle);

    // 3: sccue
    double sccue = 0;
    var cueWords = CueWords.getInstance(Conf.CUE_WORDS_PATH);
    foreach (string aWord in sent.words)
    {
        if (cueWords.contains(aWord))
        {
            sccue += termFrequency(sent, aWord);
        }
    }
    Trace.write("SCCue : " + sccue);

    // 4: sctfidf
    double sctfidf = 0;
    foreach (string aWord in sent.words)
    {
        double tf = termFrequency(sent, aWord);
        if (tf != 0)
        {
            sctfidf += ((tf - 1) / tf) * idf.get(aWord);
        }
    }

    // Average over the sentence length; an empty sentence keeps a score of 0.
    int wordCount = sent.words.Count;
    if (wordCount > 0)
    {
        sctfidf = sctfidf / wordCount;
    }
    Trace.write("SCTFIDF : " + sctfidf);

    double weight = sclead + sctitle + sccue + sctfidf;
    sent.weight = weight;
    Trace.write("Weight : " + weight);

    return weight;
}
// Generates an IDF from the "*.txt" files directly inside docsFolder and
// writes it to idfFile.
private static void preprocessIDF(string docsFolder, string idfFile)
{
    string[] textFiles = Directory.GetFiles(docsFolder, "*.txt", SearchOption.TopDirectoryOnly);

    IDF generated = IDF.IDFGenerator.fromFiles(textFiles);
    generated.toFile(idfFile);
}
// Generates an IDF file from a folder of "*.txt" documents.
//   args[0] (optional): folder containing the documents;
//   args[1] (optional): output file.
// Without arguments the previously hard-coded folder and "IDF.txt" are used,
// so existing invocations behave as before.
static void Main(string[] args)
{
    string docsFolder = (args != null && args.Length > 0)
        ? args[0]
        : @"D:\Files\College\Advanced AI\Data Sets\CNNArabic2\Dest2\";
    string idfFile = (args != null && args.Length > 1)
        ? args[1]
        : @"IDF.txt";

    IDF idf = IDF.IDFGenerator.fromFiles(Directory.GetFiles(docsFolder, "*.txt", SearchOption.TopDirectoryOnly));
    idf.toFile(idfFile);
}
// POST api/idf
// Wraps the posted Computer in an IDF and runs it.
public void Post([FromBody] Computer value)
{
    new IDF(value).Run();
}
// Degree-centrality summary: a sentence's weight is the number of entries in
// its row of the IDF-modified cosine matrix that exceed this.degreeCentrality.
public override string generateSummary(ArrayList docs, double compressionRatio)
{
    // Pool the sentences of all documents.
    ArrayList pooled = new ArrayList();
    foreach (Document doc in docs)
    {
        pooled.AddRange(doc.sentences);
    }

    double[][] similarity = LexRankCommon.generateIdfModifiedCosineMatrix(IDF.getInstance(), pooled);

    Trace.write(" IDF Cosine Matrix : ");
    Trace.write(MatrixUtil.printMatrix(similarity));

    for (int row = 0; row < similarity.Length; row++)
    {
        int degree = 0;
        foreach (double cell in similarity[row])
        {
            if (cell > this.degreeCentrality)
            {
                degree++;
            }
        }

        ((Sentence)pooled[row]).weight = degree;
    }

    Sentence[] ranked = (Sentence[])pooled.ToArray(typeof(Sentence));

    return SummaryUtil.SummarizeByCompressionRatio(ranked, compressionRatio);
}
// Deletes every other file under ../test/Target, runs the update with the file
// move started from a Task, then reads the file list from the first entry of
// the generated zip and checks that each listed file in the target directory
// has the recorded MD5.
//
// Fix: the file stream, the zip archive and the memory stream are wrapped in
// using blocks so they are released even when reading or parsing throws
// (the original only closed the file stream on the success path).
public void TestMethodMoveFile2()
{
    ResetLog(); // reset the log

    Runing.Increment.Log.Info("UnitTest1.TestMethodMoveFile2():生成xml文件");
    IDFHelper.CreatConfigFileWithXml("../Debug/", "http://127.0.0.1:22333/Debug/", "../test/IDFTest.zip");

    if (Directory.Exists("../test/Target"))
    {
        FileInfo[] fis = new DirectoryInfo("../test/Target").GetFiles("*.*", SearchOption.AllDirectories);
        for (int i = 0; i < fis.Length; i++)
        {
            if (i % 2 == 0)
            {
                File.Delete(fis[i].FullName);
            }
        }
    }

    Thread.Sleep(500);

    bool isDone = false;
    IDF.Update("http://127.0.0.1:22333/test/IDFTest.zip", "../test/Temp", "../test/Target", "../test/Backup")
        .OnMoveFileDone((obj, success) =>
        {
            Runing.Increment.Log.Info("移动文件成功后回调");
            isDone = true;
        })
        .OnDownloadSuccess((obj) =>
        {
            // (close the affected programs here)
            // Move the files asynchronously.
            Task.Run(() => { obj.MoveFile(); });
        })
        .OnError((e) =>
        {
            isDone = true;
        }).Go();

    while (!isDone)
    {
        Thread.Sleep(50);
    }

    var xml = XmlHelper.CreatXml();
    OriginFolder originFolder = new OriginFolder();

    using (var fs = File.Open(new FileInfo("../test/IDFTest.zip").FullName, FileMode.Open, FileAccess.Read))
    using (ZipFile zip = ZipFile.Read(fs))
    using (MemoryStream xmlms = new MemoryStream())
    {
        ZipEntry ze = zip.Entries.First(); // first entry
        ze.Extract(xmlms);
        xmlms.Position = 0;
        xml.Load(xmlms); // read the xml from the downloaded file's stream

        var node = xml.DocumentElement.SelectSingleNode("./" + typeof(OriginFolder).Name);
        originFolder.FromXml(node); // deserialize from the node under the xml root
    }

    int index = 0;
    foreach (var item in originFolder.fileItemDict.Values)
    {
        index++;
        Runing.Increment.Log.Info($"测试{index}:测试校验文件" + item.relativePath);

        string itemTarFilePath = Path.Combine(new DirectoryInfo("../test/Target").FullName, item.relativePath);
        Assert.IsTrue(MD5Helper.FileMD5(itemTarFilePath) == item.MD5);
    }
}