コード例 #1
0
        /// <summary>
        /// Compares one document against the documents found in the given folders and
        /// returns every other document whose similarity value reaches the threshold.
        /// </summary>
        /// <param name="checkDoc">Identifies the document to check; handed to <c>GetDoc</c>.</param>
        /// <param name="folders">Folders to collect comparison documents from; handed to <c>GetDocsByDirectoty</c>.</param>
        /// <param name="threshold">Minimum similarity value (inclusive) for a document to be reported.</param>
        /// <param name="termCount">Forwarded unchanged to <c>MakeSimilarityRows</c>.</param>
        /// <returns>
        /// One <c>PlagiateDoc</c> (path + similarity value) per matching document; an empty list
        /// when nothing matches or when no similarity row exists for the checked document.
        /// </returns>
        public List <PlagiateDoc> CheckBySingleDoc(string checkDoc, List <string> folders, int threshold, int termCount)
        {
            var paths = GetDocsByDirectoty(folders);
            var docs  = GetDocs(paths);
            var doc   = GetDoc(checkDoc);

            // The checked document takes part in the similarity computation together with the corpus.
            docs.Add(doc);

            // NOTE(review): similarity mode is hard-coded to 0; its meaning is defined by ClusteringFactory — confirm.
            var rows = ClusteringFactory.GetSimilarity(0).MakeSimilarityRows(docs, termCount);
            var row  = rows.FirstOrDefault(x => x.Doc.Equals(doc));

            var plagiats = new List <PlagiateDoc>();

            if (row == null)
            {
                return(plagiats);
            }

            // foreach disposes the enumerator even when the loop body throws,
            // which the previous hand-written MoveNext/Dispose sequence did not.
            foreach (var entry in row.Similarity)
            {
                // Skip the checked document's own entry.
                if (!entry.Key.Equals(doc) && entry.Value >= threshold)
                {
                    plagiats.Add(new PlagiateDoc(entry.Key.Path, entry.Value));
                }
            }

            return(plagiats);
        }
コード例 #2
0
        /// <summary>
        /// Builds, for every document, the list of hashed word shingles (sliding windows of
        /// consecutive terms) and returns them keyed by the document index.
        /// </summary>
        /// <param name="docs">Documents to process; each one is parsed from its <c>Path</c>.</param>
        /// <param name="shingleLength">Number of consecutive terms per shingle.</param>
        /// <returns>
        /// Map from <c>DocIndex</c> to the document's shingle hashes. A document that yields no
        /// terms maps to an empty list. When the same <c>DocIndex</c> occurs more than once,
        /// only the first occurrence is kept.
        /// </returns>
        /// <remarks>
        /// The window advances one term at a time. The final window of each document is the
        /// first one that would run past the end of the term list; it is truncated to the
        /// remaining terms. Stop-word filtering is currently disabled.
        /// </remarks>
        protected Dictionary <string, List <long> > GetDocumentShingle(List <Doc> docs, int shingleLength)
        {
            var doc2term = new Dictionary <string, List <long> >();
            var crc      = CRCFactory.Instance.Create();

            foreach (var doc in docs)
            {
                var docIndex = doc.DocIndex;
                var terms    = ClusteringFactory.GetMyStem().Parse(doc.Path);
                var shingles = new List <long>();

                if (terms.Count != 0)
                {
                    // Window state is per document. Previously the end flag and the shortened
                    // length leaked from one document into the next, so every document after
                    // the first produced a single (possibly shortened) shingle.
                    var windowLength = shingleLength;
                    var lastWindow   = false;

                    for (var startIndex = 0; !lastWindow; startIndex++)
                    {
                        if (startIndex + windowLength > terms.Count)
                        {
                            // Truncate the trailing window to the terms that are left.
                            windowLength = terms.Count - startIndex;
                            lastWindow   = true;
                        }

                        var shingle   = string.Join(" ", terms.GetRange(startIndex, windowLength));
                        var hashValue = crc.ComputeHash(Encoding.UTF8.GetBytes(shingle));
                        var res       = BitConverter.ToInt32(hashValue.Hash, 0);

                        // Same value as int.MaxValue + |int.MinValue + |res||, but evaluated in
                        // 64-bit arithmetic so that res == 0 and res == int.MinValue no longer
                        // make Math.Abs(int) throw OverflowException.
                        var shing = (long) uint.MaxValue - Math.Abs((long) res);

                        shingles.Add(shing);
                    }
                }

                if (!doc2term.ContainsKey(docIndex))
                {
                    doc2term.Add(docIndex, shingles);
                }
            }

            return(doc2term);
        }
コード例 #3
0
        /// <summary>
        /// Clusters the documents at the given paths by similarity and returns the clusters
        /// that contain more than one document.
        /// </summary>
        /// <param name="paths">Paths of the documents to load via <c>GetDocs</c>.</param>
        /// <param name="threshold">Forwarded unchanged to the greedy clustering step.</param>
        /// <param name="termCount">Forwarded unchanged to <c>MakeSimilarityRows</c>.</param>
        /// <param name="mode">Selects the similarity implementation via <c>ClusteringFactory.GetSimilarity</c>.</param>
        /// <returns>One <c>ClusterDoc</c> per cluster holding at least two documents.</returns>
        public List <ClusterDoc> Check(List <string> paths, int threshold, int termCount, int mode)
        {
            var documents      = GetDocs(paths);
            var similarity     = ClusteringFactory.GetSimilarity(mode);
            var similarityRows = similarity.MakeSimilarityRows(documents, termCount);

            var found  = ClusteringFactory.GetGreedy().Clustering(similarityRows, threshold);
            var result = new List <ClusterDoc>();

            foreach (var candidate in found)
            {
                // A cluster with a single document has nothing it is similar to.
                if (candidate.Docs.Count <= 1)
                {
                    continue;
                }

                result.Add(new ClusterDoc(candidate));
            }

            return(result);
        }
コード例 #4
0
        /// <summary>
        /// Builds, for every document, a list of its recurring terms and returns the lists
        /// keyed by the document index.
        /// </summary>
        /// <param name="docs">Documents to process; each one is parsed from its <c>Path</c>.</param>
        /// <param name="termCount">Maximum number of terms kept per document.</param>
        /// <returns>
        /// Map from <c>DocIndex</c> to at most <paramref name="termCount"/> terms. When the same
        /// <c>DocIndex</c> occurs more than once, only the first occurrence is kept.
        /// </returns>
        /// <remarks>
        /// A term qualifies when, after removing every "??" marker, it is at least four
        /// characters long and occurs more than twice in the document. Qualifying terms are
        /// ordered by ascending occurrence count; ties keep first-appearance order.
        /// Stop-word filtering is currently disabled.
        /// </remarks>
        protected Dictionary <string, List <string> > GetDocumentShingle(List <Doc> docs, int termCount)
        {
            var doc2term = new Dictionary <string, List <string> >();

            foreach (var doc in docs)
            {
                var docIndex = doc.DocIndex;
                var terms    = ClusteringFactory.GetMyStem().Parse(doc.Path);

                // Count occurrences of each normalized term in a single pass. Counting the
                // normalized form (instead of searching the raw list for it) keeps terms that
                // carried a "??" marker from being under-counted, and avoids rescanning the
                // whole list for every distinct term.
                var counts    = new Dictionary <string, int>();
                var firstSeen = new List <string>(); // explicit order; Dictionary order is not guaranteed

                foreach (var rawTerm in terms)
                {
                    var term = rawTerm.Replace("??", "");
                    if (term.Length < 4)
                    {
                        continue;
                    }

                    int seen;
                    if (counts.TryGetValue(term, out seen))
                    {
                        counts[term] = seen + 1;
                    }
                    else
                    {
                        counts.Add(term, 1);
                        firstSeen.Add(term);
                    }
                }

                // Keep terms occurring more than twice, sorted by occurrence count.
                // OrderBy is stable, so equal counts stay in first-appearance order.
                // NOTE(review): ascending order means the least frequent qualifying terms
                // survive the truncation below — confirm this is intended.
                var ranked = firstSeen
                    .Where(term => counts[term] > 2)
                    .OrderBy(term => counts[term])
                    .ToList();

                var resTerms = ranked.Count <= termCount
                    ? ranked
                    : ranked.GetRange(0, termCount);

                if (!doc2term.ContainsKey(docIndex))
                {
                    doc2term.Add(docIndex, resTerms);
                }
            }

            return(doc2term);
        }