Exemple #1
0
        /// <summary>
        /// Builds, for every document, the list of hashed word shingles (sliding windows of
        /// consecutive terms joined by a single space).
        /// </summary>
        /// <param name="docs">Documents to process; each one is parsed from <c>doc.Path</c>.</param>
        /// <param name="shingleLength">Number of consecutive terms in a full shingle.</param>
        /// <returns>
        /// A map from <c>doc.DocIndex</c> to that document's shingle hashes. A document whose
        /// parse yields no terms maps to an empty list. When several documents share the same
        /// index, only the first one is kept.
        /// </returns>
        protected Dictionary <string, List <long> > GetDocumentShingle(List <Doc> docs, int shingleLength)
        {
            var doc2term = new Dictionary <string, List <long> >();

            // Stop-word filtering is currently not applied to the parsed terms.
            var crc = CRCFactory.Instance.Create();

            foreach (var doc in docs)
            {
                var docIndex = doc.DocIndex;
                var terms    = ClusteringFactory.GetMyStem().Parse(doc.Path);
                var shingles = new List <long>();

                if (terms.Count != 0)
                {
                    for (var startIndex = 0; ; startIndex++)
                    {
                        // Window state is computed per iteration from the untouched parameter.
                        // Previously the "last window" flag and the shortened length leaked into
                        // every following document, which then produced a single, wrongly sized
                        // shingle each.
                        var remaining    = terms.Count - startIndex;
                        var isLastWindow = remaining < shingleLength;
                        var windowLength = isLastWindow ? remaining : shingleLength;

                        var shingle = string.Join(" ", terms.GetRange(startIndex, windowLength));

                        var hashValue = crc.ComputeHash(Encoding.UTF8.GetBytes(shingle));
                        var res       = BitConverter.ToInt32(hashValue.Hash, 0);

                        // Same value as int.MaxValue + |int.MinValue + |res||, but evaluated in
                        // 64-bit arithmetic: the 32-bit form threw OverflowException for
                        // res == 0 and res == int.MinValue.
                        var shing = uint.MaxValue - Math.Abs((long)res);

                        shingles.Add(shing);

                        // The final window is the shortened tail that no longer fits a full
                        // shingle; it is still emitted, then the document is done.
                        if (isLastWindow)
                        {
                            break;
                        }
                    }
                }

                if (!doc2term.ContainsKey(docIndex))
                {
                    doc2term.Add(docIndex, shingles);
                }
            }

            return doc2term;
        }
Exemple #2
0
        /// <summary>
        /// Selects, for every document, up to <paramref name="termCount"/> terms that occur
        /// more than twice in it.
        /// </summary>
        /// <param name="docs">Documents to process; each one is parsed from <c>doc.Path</c>.</param>
        /// <param name="termCount">Maximum number of terms kept per document.</param>
        /// <returns>
        /// A map from <c>doc.DocIndex</c> to the selected terms, ordered by ascending number of
        /// occurrences (ties keep first-appearance order). When several documents share the
        /// same index, only the first one is kept.
        /// </returns>
        protected Dictionary <string, List <string> > GetDocumentShingle(List <Doc> docs, int termCount)
        {
            var doc2term = new Dictionary <string, List <string> >();

            // Stop-word filtering is currently not applied to the parsed terms.
            foreach (var doc in docs)
            {
                var docIndex = doc.DocIndex;

                var terms = ClusteringFactory.GetMyStem().Parse(doc.Path);

                // Occurrence count per normalised term, plus the order in which each term was
                // first seen so the later ranking does not depend on Dictionary enumeration order.
                var countMap       = new Dictionary <string, int>();
                var firstSeenOrder = new List <string>();

                foreach (var rawTerm in terms)
                {
                    // Strip the "??" marker before counting, so that the count refers to the
                    // same normalised form that is used as the key. Counting used to be done
                    // against the raw terms, which under-counted every term carrying "??".
                    var term = rawTerm.Replace("??", "");

                    // Terms shorter than four characters are ignored.
                    if (term.Length < 4)
                    {
                        continue;
                    }

                    // Single pass: count each occurrence as it is met.
                    int seen;
                    if (countMap.TryGetValue(term, out seen))
                    {
                        countMap[term] = seen + 1;
                    }
                    else
                    {
                        countMap.Add(term, 1);
                        firstSeenOrder.Add(term);
                    }
                }

                // Keep terms that occur more than twice, sorted by number of occurrences.
                // NOTE(review): the order is ascending, so truncation keeps the LEAST frequent
                // qualifying terms - confirm this is intended rather than descending.
                var rankedTerms = firstSeenOrder
                    .Where(term => countMap[term] > 2)
                    .OrderBy(term => countMap[term])
                    .ToList();

                var resTerms = rankedTerms.Count <= termCount
                    ? rankedTerms
                    : rankedTerms.GetRange(0, termCount);

                if (!doc2term.ContainsKey(docIndex))
                {
                    doc2term.Add(docIndex, resTerms);
                }
            }

            return doc2term;
        }