/// <summary>
/// Finds the entry of <c>Basis</c> that most closely resembles the supplied text
/// and reports that entry's document type.
/// </summary>
/// <param name="line">Text from which the probe item compared against every basis entry is built.</param>
/// <returns>
/// The <c>Type</c> of the basis entry with the highest proximity, provided that proximity is
/// strictly greater than 0.5; otherwise <c>null</c>. On equal proximities the earlier entry wins.
/// </returns>
public ArchiveDocumentType? FindApproximation(string line)
{
    // The probe is only used for comparison; its own type (Other) is never returned.
    var probe = new ApproximationItem(ArchiveDocumentType.Other, line);

    ApproximationItem bestMatch = null;
    var bestScore = 0.5;

    foreach (var candidate in Basis)
    {
        var score = probe.FindProximity(candidate);

        // Written as a negated "greater than" so that a NaN score is skipped as well.
        if (!(score > bestScore))
        {
            continue;
        }

        bestScore = score;
        bestMatch = candidate;
    }

    if (bestMatch == null)
    {
        return null;
    }

    return bestMatch.Type;
}
/// <summary>
/// Scores how closely this instance's one-, two- and three-letter occurrence tables
/// match the corresponding tables of <paramref name="item"/>.
/// </summary>
/// <remarks>
/// For every key of <paramref name="item"/>'s tables that is also present in this instance's
/// matching table, a per-key score is accumulated (see <c>ScoreOccurrenceOverlap</c>).
/// The three sums are weighted 1, 2 and 3 and divided by the equally weighted number of
/// entries in <paramref name="item"/>'s tables.
/// NOTE(review): the numerator adds up occurrence values while the denominator counts table
/// entries, so the result is not confined to [0, 1] and can be negative - confirm this
/// normalisation is intended.
/// </remarks>
/// <param name="item">The item whose occurrence tables are used as the reference.</param>
/// <returns>
/// The weighted score ratio, or 0 when all three tables of <paramref name="item"/> are empty
/// (previously this case produced NaN from a 0/0 division).
/// </returns>
public double FindProximity(ApproximationItem item)
{
    // TryGetValue performs a single hashed lookup using the table's own key comparer,
    // replacing the former ContainsKey check followed by a linear First(...) scan.
    var oneLetterScore = 0;
    foreach (var occurrence in item.OneLetterOccurrence)
    {
        int ownCount;
        if (OneLetterOccurrence.TryGetValue(occurrence.Key, out ownCount))
        {
            oneLetterScore += ScoreOccurrenceOverlap(ownCount, occurrence.Value);
        }
    }

    var twoLetterScore = 0;
    foreach (var occurrence in item.TwoLetterOccurrence)
    {
        int ownCount;
        if (TwoLetterOccurrence.TryGetValue(occurrence.Key, out ownCount))
        {
            twoLetterScore += ScoreOccurrenceOverlap(ownCount, occurrence.Value);
        }
    }

    var threeLetterScore = 0;
    foreach (var occurrence in item.ThreeLetterOccurrence)
    {
        int ownCount;
        if (ThreeLetterOccurrence.TryGetValue(occurrence.Key, out ownCount))
        {
            threeLetterScore += ScoreOccurrenceOverlap(ownCount, occurrence.Value);
        }
    }

    var weightedTotal = item.OneLetterOccurrence.Count
                        + 2 * item.TwoLetterOccurrence.Count
                        + 3 * item.ThreeLetterOccurrence.Count;

    // Nothing to compare against: report "no proximity" instead of dividing 0 by 0.
    if (weightedTotal == 0)
    {
        return 0.0;
    }

    var weightedScore = oneLetterScore + 2 * twoLetterScore + 3 * threeLetterScore;
    return (double)weightedScore / weightedTotal;
}

/// <summary>
/// Per-key contribution to the proximity score: this instance's count when it does not exceed
/// the reference count; otherwise the reference count reduced by the excess
/// (<c>2 * otherCount - ownCount</c>), which may be negative.
/// </summary>
/// <param name="ownCount">Occurrence count stored in this instance's table.</param>
/// <param name="otherCount">Occurrence count stored in the reference item's table.</param>
/// <returns>The score contributed by one shared key.</returns>
private static int ScoreOccurrenceOverlap(int ownCount, int otherCount)
{
    return ownCount <= otherCount ? ownCount : 2 * otherCount - ownCount;
}