Пример #1
0
        /// <summary>
        /// Builds a sparse feature vector from a word-to-count map.
        /// </summary>
        /// <remarks>
        /// Every word is translated to its index in <c>Lexicon</c>. Words that are not in the
        /// lexicon yet are added to it with the next free index (starting at the lexicon size
        /// observed on entry), so calling this method mutates <c>Lexicon</c>.
        /// <c>Lexicon</c> must be non-null: it is dereferenced unconditionally before the loop.
        /// </remarks>
        /// <param name="vec">Word counts; each key is a word and each value the weight to add for it.</param>
        /// <returns>
        /// The populated vector, or <c>null</c> when the resulting key array is empty
        /// (e.g. <paramref name="vec"/> has no entries).
        /// </returns>
        public SparseVectorList GetFeatureVector(Dictionary <string, int> vec)
        {
            SparseVectorList featurevector = new SparseVectorList();

            // Next index to hand out to a word that is not in the lexicon yet.
            int nextLexiconIndex = Lexicon.Count;

            foreach (var kvp in vec)
            {
                string word = kvp.Key;
                int    wordIndex;
                if (!Lexicon.TryGetValue(word, out wordIndex))
                {
                    wordIndex = nextLexiconIndex;
                    Lexicon.Add(word, wordIndex);
                    nextLexiconIndex++;
                }

                // Increase reports false when it did not update an entry (presumably because the
                // index is not in the vector yet); in that case insert a fresh entry.
                if (!featurevector.Increase(wordIndex, kvp.Value))
                {
                    featurevector.Insert(wordIndex, kvp.Value);
                }
            }

            featurevector.ListToArray();
            featurevector.count = featurevector.keyarray.Length;
            if (featurevector.count < 1)
            {
                return(null);
            }

            featurevector.InvalidateList();
            featurevector.GetNorm();
            return(featurevector);
        }
Пример #2
0
        /// <summary>
        /// Builds a sparse term-count feature vector for a document.
        /// </summary>
        /// <remarks>
        /// The document text is obtained through <c>LuceneOperations.GetDocumentContent</c> (using the
        /// configured field weights and leading-sentence count) and split with
        /// <c>NLPOperations.Tokenize</c>. Each token contributes 1 to the entry of its lexicon index.
        /// Tokens that are not in <c>Lexicon</c> yet are added to it with the next free index, so
        /// calling this method mutates <c>Lexicon</c>. <c>Lexicon</c> must be non-null: it is
        /// dereferenced unconditionally before the loop.
        /// </remarks>
        /// <param name="doc">The document to vectorize.</param>
        /// <returns>
        /// The populated vector, or <c>null</c> when the resulting key array is empty
        /// (e.g. tokenization produced no tokens).
        /// </returns>
        public SparseVectorList GetFeatureVector(Document doc)
        {
            SparseVectorList featurevector = new SparseVectorList();

            // Next index to hand out to a token that is not in the lexicon yet.
            int nextLexiconIndex = Lexicon.Count;

            var content = LuceneOperations.GetDocumentContent(doc, _fieldWeightDict, _leadingSentencesCnt);
            var words   = NLPOperations.Tokenize(content, _tokenizeConfig);

            foreach (var word in words)
            {
                int wordIndex;
                if (!Lexicon.TryGetValue(word, out wordIndex))
                {
                    wordIndex = nextLexiconIndex;
                    Lexicon.Add(word, wordIndex);
                    nextLexiconIndex++;
                }

                // Increase reports false when it did not update an entry (presumably because the
                // index is not in the vector yet); in that case insert a fresh entry.
                if (!featurevector.Increase(wordIndex, 1))
                {
                    featurevector.Insert(wordIndex, 1);
                }
            }

            featurevector.ListToArray();
            featurevector.count = featurevector.keyarray.Length;
            if (featurevector.count < 1)
            {
                return(null);
            }

            featurevector.InvalidateList();
            featurevector.GetNorm();
            return(featurevector);
        }
Пример #3
0
        /// <summary>
        /// Feeds every (key, value) pair of the vector into a <c>HeapSortDouble</c> created with
        /// capacity argument <paramref name="k"/> and returns the indices it reports as top.
        /// </summary>
        /// <param name="featurevector">Vector whose <c>keyarray</c>/<c>valuearray</c> are scanned in parallel.</param>
        /// <param name="k">Value passed to the <c>HeapSortDouble</c> constructor (presumably the number of entries to keep).</param>
        /// <returns>The result of <c>GetTopIndices()</c>, copied into an array.</returns>
        private static int[] getMostFreqWordIndex(SparseVectorList featurevector, int k)
        {
            var topSelector = new HeapSortDouble(k);

            int position = 0;
            while (position < featurevector.keyarray.Length)
            {
                var wordIndex = featurevector.keyarray[position];
                var frequency = featurevector.valuearray[position];
                topSelector.Insert(wordIndex, frequency);
                position++;
            }

            var topIndices = topSelector.GetTopIndices();
            return(topIndices.ToArray <int>());
        }
Пример #4
0
    /// <summary>
    /// Computes the dot product of two sparse vectors by advancing a cursor over each key array
    /// and multiplying the values wherever both cursors point at the same key.
    /// </summary>
    /// <remarks>
    /// Only the first <c>count</c> entries of each vector are considered. The merge only pairs up
    /// all shared keys when both key arrays are in ascending order — presumably guaranteed by the
    /// code that fills <c>keyarray</c>; confirm against <c>SparseVectorList</c>.
    /// An empty vector on either side yields 0.
    /// </remarks>
    /// <param name="featurevector1">First vector.</param>
    /// <param name="featurevector2">Second vector.</param>
    /// <returns>The sum of <c>value1 * value2</c> over all matched keys.</returns>
    public static double DotProduct(SparseVectorList featurevector1, SparseVectorList featurevector2)
    {
        int length1 = featurevector1.count;
        int length2 = featurevector2.count;

        int[] keys1   = featurevector1.keyarray;
        int[] values1 = featurevector1.valuearray;
        int[] keys2   = featurevector2.keyarray;
        int[] values2 = featurevector2.valuearray;

        int    pt1 = 0;
        int    pt2 = 0;
        double ret = 0;

        // Both cursors are bounds-checked before either key is read, so neither array is ever
        // indexed past its logical length (this also covers an empty second vector).
        while (pt1 < length1 && pt2 < length2)
        {
            int key1 = keys1[pt1];
            int key2 = keys2[pt2];

            if (key1 < key2)
            {
                pt1++;
            }
            else if (key2 < key1)
            {
                pt2++;
            }
            else
            {
                // Widen to double before multiplying so large counts cannot overflow int.
                ret += (double)values1[pt1] * values2[pt2];
                pt1++;
                pt2++;
            }
        }

        return(ret);
    }
Пример #5
0
    /// <summary>
    /// Computes the cosine similarity of two sparse vectors: their dot product divided by both
    /// stored norms (<c>normvalue</c>).
    /// </summary>
    /// <remarks>
    /// The vector with fewer entries is always passed first to <c>DotProduct</c>; when the
    /// arguments arrive in the other order the call is simply repeated with them swapped.
    /// No guard is applied for a zero norm — NOTE(review): callers appear to rely on vectors
    /// having had their norm computed and being non-empty; confirm.
    /// </remarks>
    /// <param name="featurevector1">First vector.</param>
    /// <param name="featurevector2">Second vector.</param>
    /// <returns>Dot product divided by the norm of each vector.</returns>
    public static double Cosine(SparseVectorList featurevector1, SparseVectorList featurevector2)
    {
        if (featurevector1.count > featurevector2.count)
        {
            return(Cosine(featurevector2, featurevector1));
        }

        return(DotProduct(featurevector1, featurevector2) / featurevector1.normvalue / featurevector2.normvalue);
    }
Пример #6
0
        /// <summary>
        /// Filters near-duplicate documents out of <paramref name="orgDocIDs"/>.
        /// </summary>
        /// <remarks>
        /// Documents are processed in input order. Each one is turned into a feature vector and
        /// compared (cosine similarity) against previously kept documents that fall into a
        /// neighbouring time slice and are bucketed under one of this document's top
        /// <paramref name="wordWindowSize"/> word indices. If any comparison reaches
        /// <c>Configure.MinDistinctiveDocumentCosine</c> the document is dropped; otherwise it is
        /// kept and bucketed under its own time slice and its first top word index.
        /// Documents whose feature vector is <c>null</c> are dropped as well.
        /// When <c>Configure.IsDebug</c> is set, removed pairs are appended to
        /// <c>Configure.OutputPath + "debug.txt"</c>.
        /// </remarks>
        /// <param name="reader">Index used to load documents by id.</param>
        /// <param name="orgDocIDs">Candidate document ids, in processing order.</param>
        /// <param name="timeWindowSize">
        /// Width of the time window to search. Values of 15 or more are re-expressed as a small
        /// number of coarser time slices (see the console message).
        /// </param>
        /// <param name="wordWindowSize">Number of top word indices requested per document for bucket lookup.</param>
        /// <returns>The ids of the documents that were kept, in input order.</returns>
        private List <int> RemoveSimilarDocumentsGranu(IndexReader reader, List <int> orgDocIDs,
                                                       int timeWindowSize, int wordWindowSize)
        {
            var newDocIDs        = new List <int>();
            var removeSimilarity = Configure.MinDistinctiveDocumentCosine;

            // time slice -> word index -> kept vectors bucketed under that (slice, word).
            var uniqueDocHash = new Dictionary <int, Dictionary <int, List <SparseVectorList> > >();
            var lexicon       = new Dictionary <string, int>();
            int docNum        = orgDocIDs.Count;
            int removeDocNum  = 0;
            int timeslicesize = 1;

            if (timeWindowSize >= 15)
            {
                // Prefer a slice size that divides the window into 3, 4, 5 or 7 equal pieces.
                int[] dividePieceNumbers = new int[] { 3, 4, 5, 7 };
                foreach (int dividePieceNumber in dividePieceNumbers)
                {
                    if (timeWindowSize % dividePieceNumber == 0)
                    {
                        timeslicesize = timeWindowSize / dividePieceNumber;
                        break;
                    }
                }

                if (timeslicesize == 1)
                {
                    // No exact divisor found: fall back to three slices, rounding the slice size up.
                    timeslicesize  = (timeWindowSize + 2) / 3;
                    timeWindowSize = 3;
                }
                else
                {
                    timeWindowSize /= timeslicesize;
                }
                Console.WriteLine("Reset window size! TimeSliceSize: {0}, WindowSize: {1}", timeslicesize, timeWindowSize);
            }

            int begintimedelta = -(timeWindowSize - 1) / 2;
            int endtimedelta   = timeWindowSize / 2;
            var progress       = new ProgramProgress(docNum);

            StreamWriter debugSw = null;

            try
            {
                if (Configure.IsDebug)
                {
                    string fileName = Configure.OutputPath + "debug.txt";
                    FileOperations.EnsureFileFolderExist(fileName);
                    debugSw = new StreamWriter(fileName, true, Encoding.UTF8);
                }

                foreach (var iDoc in orgDocIDs)
                {
                    var doc = reader.Document(iDoc);
                    SparseVectorList vector = GetFeatureVector(doc, lexicon);
                    if (vector == null)
                    {
                        // Counted as removed; note that progress is not advanced for these documents.
                        removeDocNum++;
                        continue;
                    }

                    vector.documentid = iDoc;
                    int   time  = getDateTimeBingNews(doc) / timeslicesize;
                    int[] words = getMostFreqWordIndex(vector, wordWindowSize);

                    bool isUnique = true;
                    for (int stime = time + begintimedelta; isUnique && stime <= time + endtimedelta; stime++)
                    {
                        Dictionary <int, List <SparseVectorList> > sliceBuckets;
                        if (!uniqueDocHash.TryGetValue(stime, out sliceBuckets))
                        {
                            continue;
                        }

                        foreach (int sword in words)
                        {
                            List <SparseVectorList> candidates;
                            if (!sliceBuckets.TryGetValue(sword, out candidates))
                            {
                                continue;
                            }

                            foreach (SparseVectorList svector in candidates)
                            {
                                double simi = SparseVectorList.Cosine(svector, vector);
                                if (simi >= removeSimilarity)
                                {
                                    if (debugSw != null && removeDocNum <= 10000)
                                    {
                                        if (simi <= Configure.MaxShowDebugCosine)
                                        {
                                            WriteSimilarPairDebugInfo(debugSw, reader, svector, vector, simi);
                                        }
                                        debugSw.Flush();
                                    }
                                    isUnique = false;
                                    break;
                                }
                            }

                            if (!isUnique)
                            {
                                break;
                            }
                        }
                    }

                    if (isUnique)
                    {
                        // Bucket the kept document under its own time slice and first top word.
                        int keyword = words[0];

                        Dictionary <int, List <SparseVectorList> > ownSlice;
                        if (!uniqueDocHash.TryGetValue(time, out ownSlice))
                        {
                            ownSlice = new Dictionary <int, List <SparseVectorList> >();
                            uniqueDocHash.Add(time, ownSlice);
                        }

                        List <SparseVectorList> bucket;
                        if (!ownSlice.TryGetValue(keyword, out bucket))
                        {
                            bucket = new List <SparseVectorList>();
                            ownSlice.Add(keyword, bucket);
                        }
                        bucket.Add(vector);

                        newDocIDs.Add(iDoc);
                    }
                    else
                    {
                        removeDocNum++;
                    }

                    progress.PrintIncrementExperiment();
                }

                Console.WriteLine("Finished remove similar documents. Removed {0} out of {1}", removeDocNum, docNum);

                int listLengthSum = 0, listCnt = 0;

                foreach (Dictionary <int, List <SparseVectorList> > hash0 in uniqueDocHash.Values)
                {
                    foreach (List <SparseVectorList> list in hash0.Values)
                    {
                        listLengthSum += list.Count;
                        listCnt++;
                    }
                }

                // Nothing kept (e.g. empty input) means no buckets; report 0 instead of dividing by zero.
                int avgListLength = listCnt == 0 ? 0 : listLengthSum / listCnt;
                Console.WriteLine("AvgListLength: {0}, ListCnt: {1}", avgListLength, listCnt);
            }
            finally
            {
                // Flushes and closes the debug log even when processing fails part-way.
                if (debugSw != null)
                {
                    debugSw.Dispose();
                }
            }

            return(newDocIDs);
        }

        /// <summary>
        /// Writes one debug record for a removed document: both headlines, the first 100
        /// characters of both descriptions, and the similarity that caused the removal.
        /// </summary>
        private static void WriteSimilarPairDebugInfo(StreamWriter debugSw, IndexReader reader,
                                                      SparseVectorList keptVector, SparseVectorList removedVector,
                                                      double simi)
        {
            var keptDoc    = reader.Document(keptVector.documentid);
            var removedDoc = reader.Document(removedVector.documentid);

            debugSw.WriteLine("---------------------------------------------------");
            debugSw.WriteLine(keptDoc.Get(BingNewsFields.NewsArticleHeadline));
            debugSw.WriteLine(removedDoc.Get(BingNewsFields.NewsArticleHeadline));
            debugSw.WriteLine("");

            string body1 = keptDoc.Get(BingNewsFields.NewsArticleDescription);
            string body2 = removedDoc.Get(BingNewsFields.NewsArticleDescription);

            // A missing description comes back as null; it is then written as an empty line.
            if (body1 != null && body1.Length > 100)
            {
                body1 = body1.Substring(0, 100);
            }
            if (body2 != null && body2.Length > 100)
            {
                body2 = body2.Substring(0, 100);
            }

            debugSw.WriteLine(body1);
            debugSw.WriteLine(body2);
            debugSw.WriteLine(simi);
        }