/// <summary>
/// Compares a single document against every document found in the given folders and
/// returns the ones whose similarity to it reaches the threshold.
/// </summary>
/// <param name="checkDoc">Value passed to <c>GetDoc</c> to load the document being checked.</param>
/// <param name="folders">Folders passed to <c>GetDocsByDirectoty</c> to collect the comparison documents.</param>
/// <param name="threshold">Minimum similarity value (inclusive) for a document to be reported.</param>
/// <param name="termCount">Forwarded unchanged to <c>MakeSimilarityRows</c>.</param>
/// <returns>
/// One <see cref="PlagiateDoc"/> per sufficiently similar document; an empty list when no
/// similarity row exists for the checked document or nothing reaches the threshold.
/// </returns>
public List<PlagiateDoc> CheckBySingleDoc(string checkDoc, List<string> folders, int threshold, int termCount)
{
    var paths = GetDocsByDirectoty(folders);
    var docs = GetDocs(paths);

    // The checked document is appended to the corpus so that a similarity row is built for it.
    var doc = GetDoc(checkDoc);
    docs.Add(doc);

    // NOTE(review): similarity mode 0 is hard-coded here; its meaning is defined by ClusteringFactory.
    var rows = ClusteringFactory.GetSimilarity(0).MakeSimilarityRows(docs, termCount);
    var row = rows.FirstOrDefault(x => x.Doc.Equals(doc));

    var plagiats = new List<PlagiateDoc>();
    if (row == null)
    {
        return plagiats;
    }

    // foreach disposes the enumerator even if the loop body throws, unlike the
    // previous hand-written MoveNext()/Dispose() sequence.
    foreach (var entry in row.Similarity)
    {
        // Skip the document's similarity with itself.
        if (!entry.Key.Equals(doc) && entry.Value >= threshold)
        {
            plagiats.Add(new PlagiateDoc(entry.Key.Path, entry.Value));
        }
    }

    return plagiats;
}
/// <summary>
/// Builds, for every document, the list of hashed word shingles (sliding windows of
/// consecutive terms) produced from the document's parsed terms.
/// </summary>
/// <param name="docs">Documents to process; each is parsed from its <c>Path</c>.</param>
/// <param name="shingleLength">
/// Number of consecutive terms per shingle. The value is not validated here: a non-positive
/// value makes <c>List.GetRange</c> throw <see cref="ArgumentOutOfRangeException"/> for any
/// document that has terms.
/// </param>
/// <returns>
/// Map from document index to its shingle hashes. A document without terms maps to an empty
/// list; when several documents share an index only the first one is kept.
/// </returns>
protected Dictionary<string, List<long>> GetDocumentShingle(List<Doc> docs, int shingleLength)
{
    var doc2term = new Dictionary<string, List<long>>();
    // Stop-word filtering of the parsed terms is currently disabled.
    var crc = CRCFactory.Instance.Create();

    foreach (var doc in docs)
    {
        var docIndex = doc.DocIndex;
        var terms = ClusteringFactory.GetMyStem().Parse(doc.Path);
        var shingles = new List<long>();

        // Window state is local to each document. Previously the "last window" flag and the
        // shortened shingle length leaked from one document into all following ones, so only
        // the first document was shingled correctly.
        var startIndex = 0;
        var lastWindow = terms.Count == 0;
        while (!lastWindow)
        {
            var windowLength = shingleLength;
            if (startIndex + windowLength > terms.Count)
            {
                // The final window is truncated to the terms that remain.
                windowLength = terms.Count - startIndex;
                lastWindow = true;
            }

            var shingle = string.Join(" ", terms.GetRange(startIndex, windowLength));
            var hashValue = crc.ComputeHash(Encoding.UTF8.GetBytes(shingle));
            var res = BitConverter.ToInt32(hashValue.Hash, 0);

            // Same mapping as before, evaluated in 64-bit arithmetic: the 32-bit form threw
            // OverflowException from Math.Abs when the hash was 0 or int.MinValue.
            long magnitude = Math.Abs((long)res);
            long shing = (long)int.MaxValue + Math.Abs(int.MinValue + magnitude);
            shingles.Add(shing);

            startIndex++;
        }

        if (!doc2term.ContainsKey(docIndex))
        {
            doc2term.Add(docIndex, shingles);
        }
    }

    return doc2term;
}
/// <summary>
/// Loads the documents at the given paths, computes their pairwise similarity rows, clusters
/// them with the greedy clusterer and returns every cluster that holds more than one document.
/// </summary>
/// <param name="paths">Paths handed to <c>GetDocs</c>.</param>
/// <param name="threshold">Forwarded to the greedy clustering step.</param>
/// <param name="termCount">Forwarded to <c>MakeSimilarityRows</c>.</param>
/// <param name="mode">Selects the similarity implementation via <c>ClusteringFactory.GetSimilarity</c>.</param>
/// <returns>One <see cref="ClusterDoc"/> per cluster containing at least two documents.</returns>
public List<ClusterDoc> Check(List<string> paths, int threshold, int termCount, int mode)
{
    var documents = GetDocs(paths);
    var similarityRows = ClusteringFactory.GetSimilarity(mode).MakeSimilarityRows(documents, termCount);

    // A debug-mode dump of the similarity rows to a result file used to live here as
    // commented-out code; it was never active and has been left out.

    var clusters = ClusteringFactory.GetGreedy().Clustering(similarityRows, threshold);

    var result = new List<ClusterDoc>();
    foreach (var cluster in clusters)
    {
        // Single-document clusters are not reported.
        if (cluster.Docs.Count <= 1)
        {
            continue;
        }

        result.Add(new ClusterDoc(cluster));
    }

    return result;
}
/// <summary>
/// Selects, for every document, up to <paramref name="termCount"/> terms that occur more than
/// twice in the document's parsed text.
/// </summary>
/// <param name="docs">Documents to process; each is parsed from its <c>Path</c>.</param>
/// <param name="termCount">
/// Maximum number of terms kept per document. A negative value makes <c>List.GetRange</c>
/// throw <see cref="ArgumentOutOfRangeException"/>.
/// </param>
/// <returns>
/// Map from document index to the selected terms. When several documents share an index only
/// the first one is kept.
/// </returns>
protected Dictionary<string, List<string>> GetDocumentShingle(List<Doc> docs, int termCount)
{
    var doc2term = new Dictionary<string, List<string>>();
    // Stop-word filtering is currently disabled.

    foreach (var doc in docs)
    {
        var docIndex = doc.DocIndex;
        var terms = ClusteringFactory.GetMyStem().Parse(doc.Path);

        // Count the occurrences of every normalized term in a single pass. The previous code
        // stripped "??" from a term and then searched the raw term list for the stripped
        // form, so terms that carried "??" were undercounted (typically counted as 0).
        var countMap = new Dictionary<string, int>();
        var firstSeen = new List<string>(); // explicit first-occurrence order for stable ranking
        foreach (var rawTerm in terms)
        {
            // NOTE(review): "??" presumably marks characters the parser could not resolve - confirm.
            var term = rawTerm.Replace("??", "");
            if (term.Length < 4)
            {
                continue;
            }

            int seen;
            if (countMap.TryGetValue(term, out seen))
            {
                countMap[term] = seen + 1;
            }
            else
            {
                countMap.Add(term, 1);
                firstSeen.Add(term);
            }
        }

        // Keep terms that occur more than twice, sorted by occurrence count. OrderBy is
        // stable, so ties keep first-occurrence order without relying on Dictionary
        // enumeration order.
        // NOTE(review): the sort is ascending, so the LEAST frequent qualifying terms are the
        // ones retained when truncating to termCount - confirm this is intended.
        var ranked = firstSeen
            .Where(t => countMap[t] > 2)
            .OrderBy(t => countMap[t])
            .ToList();

        var resTerms = ranked.Count <= termCount ? ranked : ranked.GetRange(0, termCount);

        if (!doc2term.ContainsKey(docIndex))
        {
            doc2term.Add(docIndex, resTerms);
        }
    }

    return doc2term;
}