/// <summary>
/// Builds, for every document, the list of hashed word shingles (sliding windows of
/// consecutive terms joined by a single space) and returns them keyed by document index.
/// </summary>
/// <param name="docs">Documents to process; each one is parsed from its <c>Path</c>.</param>
/// <param name="shingleLength">Number of consecutive terms in a full shingle.</param>
/// <returns>
/// Map from <c>DocIndex</c> to that document's shingle hashes. A document that yields no
/// terms maps to an empty list. When several documents share an index, only the first
/// one is kept.
/// </returns>
protected Dictionary<string, List<long>> GetDocumentShingle(List<Doc> docs, int shingleLength)
{
    var doc2term = new Dictionary<string, List<long>>();
    // NOTE: stop-word filtering (stop-list.txt) is currently disabled in this method.
    var crc = CRCFactory.Instance.Create();

    foreach (var doc in docs)
    {
        var docIndex = doc.DocIndex;
        var terms = ClusteringFactory.GetMyStem().Parse(doc.Path);
        var shingles = new List<long>();

        if (terms.Count != 0)
        {
            // Window length and the stop flag are per-document state. They used to live
            // outside the loop (the flag was never reset and the parameter itself was
            // overwritten), so every document after the first produced a single shingle.
            var windowLength = shingleLength;
            var isLastWindow = false;
            var startIndex = 0;

            while (!isLastWindow)
            {
                if (startIndex + windowLength > terms.Count)
                {
                    // The window no longer fits: emit one final, shorter shingle made of
                    // the remaining terms and stop. This also covers documents that are
                    // shorter than a single full shingle.
                    // NOTE(review): for longer documents this adds a truncated tail
                    // shingle after the last full one - confirm this is intended.
                    windowLength = terms.Count - startIndex;
                    isLastWindow = true;
                }

                var shingle = string.Join(" ", terms.GetRange(startIndex, windowLength));
                var hashValue = crc.ComputeHash(Encoding.UTF8.GetBytes(shingle));
                var res = BitConverter.ToInt32(hashValue.Hash, 0);

                // Do the arithmetic in 64 bits: Math.Abs(int) throws OverflowException for
                // int.MinValue, which the 32-bit form hit whenever the hash was 0 or
                // int.MinValue. Every other hash value maps to the same result as before.
                long magnitude = Math.Abs((long)res);
                long shing = (long)int.MaxValue + Math.Abs(int.MinValue + magnitude);
                shingles.Add(shing);

                startIndex++;
            }
        }

        if (!doc2term.ContainsKey(docIndex))
        {
            doc2term.Add(docIndex, shingles);
        }
    }

    return doc2term;
}
/// <summary>
/// Selects, for every document, up to <paramref name="termCount"/> characteristic terms
/// and returns them keyed by document index.
/// </summary>
/// <remarks>
/// Terms have every "??" sequence removed, terms shorter than 4 characters are skipped,
/// and only terms occurring more than twice in the document are kept. The kept terms are
/// ordered by ascending occurrence count; ties stay in order of first appearance.
/// </remarks>
/// <param name="docs">Documents to process; each one is parsed from its <c>Path</c>.</param>
/// <param name="termCount">Maximum number of terms to return per document.</param>
/// <returns>
/// Map from <c>DocIndex</c> to the selected terms. When several documents share an index,
/// only the first one is kept.
/// </returns>
protected Dictionary<string, List<string>> GetDocumentShingle(List<Doc> docs, int termCount)
{
    var doc2term = new Dictionary<string, List<string>>();
    // NOTE: stop-word filtering (stop-list.txt) is currently disabled in this method.

    foreach (var doc in docs)
    {
        var docIndex = doc.DocIndex;
        var terms = ClusteringFactory.GetMyStem().Parse(doc.Path);

        // Count the occurrences of each normalized term in a single pass. Counting is done
        // on the cleaned term, so a term that carried "??" is no longer compared against
        // the raw list (which under-counted it) and the scan is linear instead of quadratic.
        var countMap = new Dictionary<string, int>();
        var firstSeenOrder = new List<string>();
        for (var ind = 0; ind < terms.Count; ++ind)
        {
            var term = terms[ind].Replace("??", "");
            if (term.Length < 4)
            {
                continue;
            }

            int seen;
            if (countMap.TryGetValue(term, out seen))
            {
                countMap[term] = seen + 1;
            }
            else
            {
                countMap.Add(term, 1);
                firstSeenOrder.Add(term);
            }
        }

        // Keep terms seen more than twice, sorted by occurrence count. The order is taken
        // from a list (OrderBy is stable) rather than from Dictionary.Keys, whose
        // enumeration order is not guaranteed.
        // NOTE(review): the sort is ascending, so truncation keeps the LEAST frequent
        // terms - confirm whether descending order was intended.
        var resTerms = firstSeenOrder
            .Where(term => countMap[term] > 2)
            .OrderBy(term => countMap[term])
            .ToList();

        if (resTerms.Count > termCount)
        {
            resTerms = resTerms.GetRange(0, termCount);
        }

        if (!doc2term.ContainsKey(docIndex))
        {
            doc2term.Add(docIndex, resTerms);
        }
    }

    return doc2term;
}