/// <summary>
/// Decides whether <paramref name="document"/> should be placed in <paramref name="cluster"/>.
/// </summary>
/// <param name="document">The candidate document.</param>
/// <param name="cluster">The cluster whose current members are compared against the candidate.</param>
/// <returns>
/// <c>true</c> if the odds ratio between the candidate and at least one member of the cluster
/// reaches the threshold; <c>false</c> otherwise, including when the cluster has no members.
/// </returns>
/// <remarks>
/// Every member is examined. Returning <c>false</c> from inside the loop on the first
/// non-matching member would make the result depend solely on whichever document happens to be
/// enumerated first, so the negative result is only produced after the loop is exhausted.
/// NOTE(review): "any member matches" is assumed to be the intended rule (as opposed to
/// "all members match") because the fall-through result is <c>false</c> — confirm.
/// </remarks>
private bool BelongsTo(Document document, DocumentCluster cluster)
{
    // Minimum odds ratio at which two documents are treated as belonging together.
    const double MinimumOddsRatio = 2;

    foreach (Document doc in cluster.Documents)
    {
        double oddsRatio = _similarity.CalculateOddsRatio(document, doc);
        if (oddsRatio >= MinimumOddsRatio)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Groups <paramref name="documents"/> into clusters, processing them in enumeration order.
/// </summary>
/// <param name="documents">The documents to cluster. Enumerated exactly once.</param>
/// <returns>
/// The clusters that were created, in creation order. Empty when <paramref name="documents"/>
/// yields no elements.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="documents"/> is <c>null</c>.</exception>
/// <remarks>
/// Each document is offered to <c>FindCluster</c> together with the clusters built so far; a
/// <c>null</c> result is treated as "no existing cluster fits" and a new cluster is started for
/// that document.
/// </remarks>
public IEnumerable<DocumentCluster> Cluster(IEnumerable<Document> documents)
{
    if (documents == null)
    {
        throw new ArgumentNullException("documents");
    }

    List<DocumentCluster> clusters = new List<DocumentCluster>();

    foreach (Document document in documents)
    {
        // Scoped to the iteration: no placeholder cluster needs to be allocated up front.
        DocumentCluster cluster = FindCluster(document, clusters);
        if (cluster == null)
        {
            cluster = new DocumentCluster();
            clusters.Add(cluster);
        }

        cluster.Add(document);
    }

    return clusters;
}
/// <summary>
/// Produces one <see cref="DocumentClusterErrorScore"/> per document of
/// <paramref name="originalClusters"/>, describing whether the document landed in the same
/// result cluster as the first document of its original cluster.
/// </summary>
/// <param name="resultClusters">The clusters produced by the algorithm under evaluation.</param>
/// <param name="originalClusters">The reference clusters to compare against.</param>
/// <returns>
/// The scores, ordered by original cluster and then by document within that cluster.
/// </returns>
/// <exception cref="ArgumentException">
/// <c>FindClusterIndex</c> reports -1 for a document of an original cluster.
/// </exception>
/// <remarks>
/// For each original cluster, the result-cluster index of its first document becomes the
/// reference. That first document's score is added with <c>Value</c> left unassigned (whatever
/// a freshly constructed score holds). Every later document scores 1 when its index equals the
/// reference and -1 when it differs.
/// </remarks>
private static IEnumerable<DocumentClusterErrorScore> CalculateErrorScore(IEnumerable<DocumentCluster> resultClusters, IEnumerable<DocumentCluster> originalClusters)
{
    List<DocumentClusterErrorScore> scores = new List<DocumentClusterErrorScore>();

    // Materialize both inputs once, result clusters first, so they can be searched repeatedly.
    List<DocumentCluster> actual = resultClusters.ToList();
    List<DocumentCluster> expected = originalClusters.ToList();

    foreach (DocumentCluster expectedCluster in expected)
    {
        // Index of the result cluster holding this original cluster's first document.
        int? referenceIndex = null;

        foreach (Document member in expectedCluster)
        {
            DocumentClusterErrorScore entry = new DocumentClusterErrorScore();
            int foundIndex = FindClusterIndex(actual, member);

            if (foundIndex == -1)
            {
                throw new ArgumentException("Can't find doc");
            }

            if (referenceIndex.HasValue)
            {
                if (foundIndex == referenceIndex.Value)
                {
                    entry.Value = 1;
                }
                else
                {
                    entry.Value = -1;
                }
            }
            else
            {
                // First document of the cluster: it defines the reference and keeps its
                // score's initial Value.
                referenceIndex = foundIndex;
            }

            scores.Add(entry);
        }
    }

    return scores;
}