public SparseVectorList GetFeatureVector(Dictionary<string, int> vec)
{
    SparseVectorList featurevector = new SparseVectorList();
    int lexiconindexcount = Lexicon.Count;
    foreach (var kvp in vec)
    {
        var word = kvp.Key;
        int value;
        // Unseen words are appended to the lexicon with the next free index.
        if (!Lexicon.TryGetValue(word, out value))
        {
            Lexicon.Add(word, lexiconindexcount);
            value = lexiconindexcount;
            lexiconindexcount++;
        }
        if (!featurevector.Increase(value, kvp.Value))
        {
            featurevector.Insert(value, kvp.Value);
        }
    }
    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;
    if (featurevector.count < 1)
    {
        return null;
    }
    featurevector.InvalidateList();
    featurevector.GetNorm();
    return featurevector;
}
public SparseVectorList GetFeatureVector(Document doc)
{
    SparseVectorList featurevector = new SparseVectorList();
    int lexiconindexcount = Lexicon.Count;
    var content = LuceneOperations.GetDocumentContent(doc, _fieldWeightDict, _leadingSentencesCnt);
    var words = NLPOperations.Tokenize(content, _tokenizeConfig);
    foreach (var word in words)
    {
        int value;
        // Unseen words are appended to the lexicon with the next free index.
        if (!Lexicon.TryGetValue(word, out value))
        {
            Lexicon.Add(word, lexiconindexcount);
            value = lexiconindexcount;
            lexiconindexcount++;
        }
        // Each occurrence contributes a term frequency of 1.
        if (!featurevector.Increase(value, 1))
        {
            featurevector.Insert(value, 1);
        }
    }
    featurevector.ListToArray();
    featurevector.count = featurevector.keyarray.Length;
    //featurevector.SumUpValueArray();
    if (featurevector.count < 1)
    {
        return null;
    }
    featurevector.InvalidateList();
    featurevector.GetNorm();
    return featurevector;
}
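// --- Illustration (not part of the original class) ---
// A minimal standalone sketch of the same idea as the two GetFeatureVector overloads
// above: build a sparse term-frequency vector while growing a shared word -> index
// lexicon. Plain dictionaries stand in for SparseVectorList; all names here are
// illustrative and hypothetical.
using System;
using System.Collections.Generic;

static class TermVectorSketch
{
    // Builds a sparse term-frequency vector (word index -> count),
    // adding unseen words to the shared lexicon as it goes.
    public static Dictionary<int, int> Build(IEnumerable<string> words, Dictionary<string, int> lexicon)
    {
        var vector = new Dictionary<int, int>();
        foreach (var word in words)
        {
            int index;
            if (!lexicon.TryGetValue(word, out index))
            {
                index = lexicon.Count;      // next free index
                lexicon.Add(word, index);
            }
            int count;
            vector.TryGetValue(index, out count);
            vector[index] = count + 1;      // term frequency
        }
        return vector;
    }

    static void Main()
    {
        var lexicon = new Dictionary<string, int>();
        var v = Build(new[] { "news", "story", "news" }, lexicon);
        Console.WriteLine("distinct terms: {0}, lexicon size: {1}", v.Count, lexicon.Count);
    }
}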
private static int[] getMostFreqWordIndex(SparseVectorList featurevector, int k)
{
    var sort = new HeapSortDouble(k);
    for (int iword = 0; iword < featurevector.keyarray.Length; iword++)
    {
        sort.Insert(featurevector.keyarray[iword], featurevector.valuearray[iword]);
    }
    return sort.GetTopIndices().ToArray<int>();
}
public static double DotProduct(SparseVectorList featurevector1, SparseVectorList featurevector2)
{
    int length1 = featurevector1.count;
    int length2 = featurevector2.count;
    int[] keys1 = featurevector1.keyarray;
    int[] values1 = featurevector1.valuearray;
    int[] keys2 = featurevector2.keyarray;
    int[] values2 = featurevector2.valuearray;
    double ret = 0;
    // Two-pointer merge over the key arrays, which are assumed sorted;
    // only indices present in both vectors contribute to the product.
    int pt1 = 0;
    int pt2 = 0;
    while (pt1 < length1 && pt2 < length2)
    {
        if (keys1[pt1] < keys2[pt2])
        {
            pt1++;
        }
        else if (keys1[pt1] > keys2[pt2])
        {
            pt2++;
        }
        else
        {
            ret += (double)values1[pt1] * values2[pt2];
            pt1++;
            pt2++;
        }
    }
    return ret;
}
public static double Cosine(SparseVectorList featurevector1, SparseVectorList featurevector2)
{
    // Iterate with the shorter vector first.
    if (featurevector1.count > featurevector2.count)
    {
        return Cosine(featurevector2, featurevector1);
    }
    return DotProduct(featurevector1, featurevector2) / featurevector1.normvalue / featurevector2.normvalue;
}
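// --- Illustration (not part of the original class) ---
// A self-contained sketch of the sorted two-pointer intersection that DotProduct and
// Cosine rely on, using plain arrays instead of SparseVectorList; the names and the
// Main example are hypothetical.
using System;

static class SparseCosineSketch
{
    // Dot product of two sparse vectors stored as parallel (sorted key, value) arrays.
    static double Dot(int[] k1, int[] v1, int[] k2, int[] v2)
    {
        double sum = 0;
        int i = 0, j = 0;
        while (i < k1.Length && j < k2.Length)
        {
            if (k1[i] < k2[j]) i++;
            else if (k1[i] > k2[j]) j++;
            else { sum += (double)v1[i] * v2[j]; i++; j++; }
        }
        return sum;
    }

    static double Norm(int[] v)
    {
        double s = 0;
        foreach (var x in v) s += (double)x * x;
        return Math.Sqrt(s);
    }

    static void Main()
    {
        int[] ka = { 0, 3, 7 }, va = { 2, 1, 1 };
        int[] kb = { 3, 7, 9 }, vb = { 1, 2, 5 };
        double cosine = Dot(ka, va, kb, vb) / (Norm(va) * Norm(vb));
        Console.WriteLine(cosine); // shared keys 3 and 7 contribute 1*1 + 1*2 = 3
    }
}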
private List<int> RemoveSimilarDocumentsGranu(IndexReader reader, List<int> orgDocIDs, int timeWindowSize, int wordWindowSize)
{
    var newDocIDs = new List<int>();
    var removeSimilarity = Configure.MinDistinctiveDocumentCosine;
    // time slice -> (word index -> vectors of the documents kept so far)
    var uniqueDocHash = new Dictionary<int, Dictionary<int, List<SparseVectorList>>>();
    int docNum = orgDocIDs.Count;
    int removeDocNum = 0;
    var lexicon = new Dictionary<string, int>();

    // For large time windows, coarsen the time axis so the window spans fewer slices.
    int timeslicesize = 1;
    if (timeWindowSize >= 15)
    {
        int[] dividePieceNumbers = new int[] { 3, 4, 5, 7 };
        foreach (int dividePieceNumber in dividePieceNumbers)
        {
            if (timeWindowSize % dividePieceNumber == 0)
            {
                timeslicesize = timeWindowSize / dividePieceNumber;
                break;
            }
        }
        if (timeslicesize == 1)
        {
            timeslicesize = (timeWindowSize + 2) / 3;
            timeWindowSize = 3;
        }
        else
        {
            timeWindowSize /= timeslicesize;
        }
        Console.WriteLine("Reset window size! TimeSliceSize: {0}, WindowSize: {1}", timeslicesize, timeWindowSize);
    }
    int begintimedelta = -(timeWindowSize - 1) / 2;
    int endtimedelta = timeWindowSize / 2;

    var progress = new ProgramProgress(docNum);
    StreamWriter debugSw = null;
    if (Configure.IsDebug)
    {
        string fileName = Configure.OutputPath + "debug.txt";
        FileOperations.EnsureFileFolderExist(fileName);
        debugSw = new StreamWriter(fileName, true, Encoding.UTF8);
    }

    foreach (var iDoc in orgDocIDs)
    {
        var doc = reader.Document(iDoc);
        SparseVectorList vector = GetFeatureVector(doc, lexicon);
        if (vector == null)
        {
            removeDocNum++;
            continue;
        }
        vector.documentid = iDoc;
        int time = getDateTimeBingNews(doc) / timeslicesize;
        int[] words = getMostFreqWordIndex(vector, wordWindowSize);

        // The document is unique unless a near-duplicate is found in a neighboring
        // time slice under one of its most frequent words.
        bool isUnique = true;
        for (int stime = time + begintimedelta; stime <= time + endtimedelta; stime++)
        {
            Dictionary<int, List<SparseVectorList>> wordHash;
            if (!uniqueDocHash.TryGetValue(stime, out wordHash))
            {
                continue;
            }
            foreach (int sword in words)
            {
                List<SparseVectorList> vectorList;
                if (!wordHash.TryGetValue(sword, out vectorList))
                {
                    continue;
                }
                foreach (SparseVectorList svector in vectorList)
                {
                    if (SparseVectorList.Cosine(svector, vector) >= removeSimilarity)
                    {
                        if (Configure.IsDebug && removeDocNum <= 10000)
                        {
                            double simi = SparseVectorList.Cosine(svector, vector);
                            if (simi <= Configure.MaxShowDebugCosine)
                            {
                                debugSw.WriteLine("---------------------------------------------------");
                                debugSw.WriteLine(reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleHeadline));
                                debugSw.WriteLine(reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleHeadline));
                                debugSw.WriteLine("");
                                string body1 = reader.Document(svector.documentid).Get(BingNewsFields.NewsArticleDescription);
                                string body2 = reader.Document(vector.documentid).Get(BingNewsFields.NewsArticleDescription);
                                if (body1.Length > 100)
                                {
                                    body1 = body1.Substring(0, 100);
                                }
                                if (body2.Length > 100)
                                {
                                    body2 = body2.Substring(0, 100);
                                }
                                debugSw.WriteLine(body1);
                                debugSw.WriteLine(body2);
                                debugSw.WriteLine(simi);
                            }
                            debugSw.Flush();
                        }
                        isUnique = false;
                        break;
                    }
                }
                if (!isUnique)
                {
                    break;
                }
            }
            if (!isUnique)
            {
                break;
            }
        }

        if (isUnique)
        {
            // Register the document under its own time slice and its most frequent word.
            int keytime = time;
            int keyword = words[0];
            if (!uniqueDocHash.ContainsKey(keytime))
            {
                uniqueDocHash.Add(keytime, new Dictionary<int, List<SparseVectorList>>());
            }
            Dictionary<int, List<SparseVectorList>> wordHash = uniqueDocHash[keytime];
            if (!wordHash.ContainsKey(keyword))
            {
                wordHash.Add(keyword, new List<SparseVectorList>());
            }
            wordHash[keyword].Add(vector);
            newDocIDs.Add(iDoc);
        }
        else
        {
            removeDocNum++;
        }
        progress.PrintIncrementExperiment();
    }

    Console.WriteLine("Finished remove similar documents. Removed {0} out of {1}", removeDocNum, docNum);

    int listLengthSum = 0, listCnt = 0;
    foreach (Dictionary<int, List<SparseVectorList>> hash0 in uniqueDocHash.Values)
    {
        foreach (List<SparseVectorList> list in hash0.Values)
        {
            listLengthSum += list.Count;
            listCnt++;
        }
    }
    if (listCnt > 0)
    {
        Console.WriteLine("AvgListLength: {0}, ListCnt: {1}", listLengthSum / listCnt, listCnt);
    }

    if (Configure.IsDebug)
    {
        debugSw.Flush();
        debugSw.Close();
    }
    return newDocIDs;
}
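// --- Illustration (not part of the original class) ---
// RemoveSimilarDocumentsGranu only compares a new document against documents that share
// a (time slice, frequent word) bucket with it, instead of against every kept document.
// A minimal sketch of that blocking idea, assuming documents arrive with a precomputed
// time slice and top-word list; DocSketch, TryKeep and the cosine delegate are
// hypothetical stand-ins, not the repo's types.
using System;
using System.Collections.Generic;

class DocSketch
{
    public int Id;
    public int TimeSlice;   // coarse time bucket
    public int[] TopWords;  // most frequent word indices, ordered by frequency
}

static class DedupBlockingSketch
{
    // (time slice, word index) -> documents already kept in that bucket
    static readonly Dictionary<Tuple<int, int>, List<DocSketch>> Buckets =
        new Dictionary<Tuple<int, int>, List<DocSketch>>();

    // Keeps the document unless a near-duplicate (cosine above threshold) is already
    // stored in a neighboring time slice under one of its top words.
    public static bool TryKeep(DocSketch doc, Func<DocSketch, DocSketch, double> cosine,
                               double threshold, int window)
    {
        for (int t = doc.TimeSlice - window; t <= doc.TimeSlice + window; t++)
        {
            foreach (int w in doc.TopWords)
            {
                List<DocSketch> candidates;
                if (!Buckets.TryGetValue(Tuple.Create(t, w), out candidates))
                {
                    continue;
                }
                foreach (var other in candidates)
                {
                    if (cosine(doc, other) >= threshold)
                    {
                        return false; // near-duplicate: drop it
                    }
                }
            }
        }
        // Distinctive: register it under its own slice and most frequent word.
        var key = Tuple.Create(doc.TimeSlice, doc.TopWords[0]);
        List<DocSketch> list;
        if (!Buckets.TryGetValue(key, out list))
        {
            Buckets[key] = list = new List<DocSketch>();
        }
        list.Add(doc);
        return true;
    }

    static void Main()
    {
        var d1 = new DocSketch { Id = 1, TimeSlice = 10, TopWords = new[] { 42, 7 } };
        var d2 = new DocSketch { Id = 2, TimeSlice = 11, TopWords = new[] { 42, 3 } };
        Func<DocSketch, DocSketch, double> cos = (a, b) => 0.95; // stand-in similarity
        Console.WriteLine(TryKeep(d1, cos, 0.8, 1)); // True: nothing stored yet
        Console.WriteLine(TryKeep(d2, cos, 0.8, 1)); // False: d1 shares bucket (word 42, slice within window)
    }
}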