public static void Main(string[] args)
{
    List<TextFileHasher> textFileHashers = new List<TextFileHasher>();

    string text1 = "abc\r\ncba\r\nabc";
    TextFileHasher tfh1 = new TextFileHasher("Text1", new MemoryStream(System.Text.Encoding.Default.GetBytes(text1)));
    tfh1.CalculateFileHash();
    textFileHashers.Add(tfh1);

    string text2 = "def\r\ncba\r\nwet";
    TextFileHasher tfh2 = new TextFileHasher("Text2", new MemoryStream(System.Text.Encoding.Default.GetBytes(text2)));
    tfh2.CalculateFileHash();
    textFileHashers.Add(tfh2);

    string text3 = "def\r\nbla\r\nwat";
    TextFileHasher tfh3 = new TextFileHasher("Text3", new MemoryStream(System.Text.Encoding.Default.GetBytes(text3)));
    tfh3.CalculateFileHash();
    textFileHashers.Add(tfh3);

    List<string> totalDuplicates = new List<string>();

    // Two nested loops => O(n^2) hell... =)
    for (int i = 0; i < textFileHashers.Count; i++)
    {
        // If n is the number of hashes in textFileHashers[i] and m the number of equal hashes,
        // GetDuplicates is O(n * m log m). Assumption: comparing two strings is O(1).
        var duplicates = textFileHashers[i].GetDuplicates();
        if (duplicates.Any())
        {
            Console.WriteLine("There are {0} duplicates in the stream ({1})", duplicates.Count, textFileHashers[i].Name);
            duplicates.ForEach(x => Console.WriteLine(x));
        }

        for (int j = i + 1; j < textFileHashers.Count; j++)
        {
            // Intersect both hash lists. This should be fast: O(n).
            var equalHashes = textFileHashers[i].Hashes.Intersect(textFileHashers[j].Hashes);
            int equalHashesCount = equalHashes.Count();
            Console.WriteLine("{0} and {1} have {2} equal hashes", textFileHashers[i].Name, textFileHashers[j].Name, equalHashesCount);

            // For each hash present in both streams:
            //   get the line positions in the outer TextFileHasher,
            //   get the line positions in the inner TextFileHasher,
            //   and compare the actual bytes to rule out hash collisions.
            foreach (var equalHash in equalHashes)
            {
                var tfh1Positions = textFileHashers[i].GetLinePositions(equalHash);
                var tfh2Positions = textFileHashers[j].GetLinePositions(equalHash);
                for (int k = 0; k < tfh1Positions.Count; k++)
                {
                    var tfh1Pos = tfh1Positions[k];
                    var tfh1ByteArray = textFileHashers[i].GetLineAsByteArray(tfh1Pos);
                    for (int m = 0; m < tfh2Positions.Count; m++)
                    {
                        var tfh2Pos = tfh2Positions[m];
                        var tfh2ByteArray = textFileHashers[j].GetLineAsByteArray(tfh2Pos);
                        if (tfh1ByteArray.SequenceEqual(tfh2ByteArray))
                        {
                            var line = System.Text.Encoding.Default.GetString(tfh1ByteArray);
                            duplicates.Add(line);
                        }
                    }
                }
            }
        }

        totalDuplicates.AddRange(duplicates);
    }

    Console.WriteLine();
    if (totalDuplicates.Count > 0)
    {
        Console.WriteLine("Total number of duplicates: {0}", totalDuplicates.Count);
        Console.WriteLine("#######################");
        totalDuplicates.ForEach(x => Console.WriteLine("{0}", x));
        Console.WriteLine("#######################");
    }

    // Free resources
    foreach (var tfh in textFileHashers)
    {
        tfh.Dispose();
    }
}
public static void Main(string[] args)
{
    List<TextFileHasher> textFileHashers = new List<TextFileHasher>();

    string text1 = "abc\r\ncba\r\nabc";
    TextFileHasher tfh1 = new TextFileHasher("Text1", new MemoryStream(System.Text.Encoding.Default.GetBytes(text1)));
    tfh1.CalculateFileHash();
    textFileHashers.Add(tfh1);

    string text2 = "def\r\ncba\r\nwet";
    TextFileHasher tfh2 = new TextFileHasher("Text2", new MemoryStream(System.Text.Encoding.Default.GetBytes(text2)));
    tfh2.CalculateFileHash();
    textFileHashers.Add(tfh2);

    string text3 = "def\r\nbla\r\nwat";
    TextFileHasher tfh3 = new TextFileHasher("Text3", new MemoryStream(System.Text.Encoding.Default.GetBytes(text3)));
    tfh3.CalculateFileHash();
    textFileHashers.Add(tfh3);

    List<string> totalDuplicates = new List<string>();

    // Group every hash across all streams: hash -> (TextFileHasher -> line positions).
    Dictionary<ulong, Dictionary<TextFileHasher, List<LinePosition>>> totalHashes =
        new Dictionary<ulong, Dictionary<TextFileHasher, List<LinePosition>>>();

    textFileHashers.ForEach(tfh =>
    {
        foreach (var dummyHash in tfh.Hashes)
        {
            Dictionary<TextFileHasher, List<LinePosition>> tfh2LinePositions;
            if (!totalHashes.TryGetValue(dummyHash, out tfh2LinePositions))
            {
                totalHashes[dummyHash] = new Dictionary<TextFileHasher, List<LinePosition>>
                {
                    { tfh, tfh.GetLinePositions(dummyHash) }
                };
            }
            else
            {
                List<LinePosition> linePositions;
                if (!tfh2LinePositions.TryGetValue(tfh, out linePositions))
                {
                    tfh2LinePositions[tfh] = tfh.GetLinePositions(dummyHash);
                }
                else
                {
                    linePositions.AddRange(tfh.GetLinePositions(dummyHash));
                }
            }
        }
    });

    HashSet<TextFileHasher> alreadyGotDuplicates = new HashSet<TextFileHasher>();
    foreach (var hash in totalHashes.Keys)
    {
        var tfh2LinePositions = totalHashes[hash];
        var tfh = tfh2LinePositions.Keys.FirstOrDefault();

        // Get the duplicates inside the TextFileHasher itself (only once per hasher).
        if (tfh != null && !alreadyGotDuplicates.Contains(tfh))
        {
            totalDuplicates.AddRange(tfh.GetDuplicates());
            alreadyGotDuplicates.Add(tfh);
        }

        if (tfh2LinePositions.Count <= 1)
        {
            continue;
        }

        // Duplicates shared by more than one TextFileHasher:
        // compare the raw bytes of every candidate pair to rule out hash collisions.
        var tfhs = tfh2LinePositions.Keys.ToArray();
        for (int i = 0; i < tfhs.Length; i++)
        {
            var tfh1Positions = tfhs[i].GetLinePositions(hash);
            for (int j = i + 1; j < tfhs.Length; j++)
            {
                var tfh2Positions = tfhs[j].GetLinePositions(hash);
                for (int k = 0; k < tfh1Positions.Count; k++)
                {
                    var tfh1Pos = tfh1Positions[k];
                    var tfh1ByteArray = tfhs[i].GetLineAsByteArray(tfh1Pos);
                    for (int m = 0; m < tfh2Positions.Count; m++)
                    {
                        var tfh2Pos = tfh2Positions[m];
                        if (tfh1Pos.Length != tfh2Pos.Length)
                        {
                            continue;
                        }
                        var tfh2ByteArray = tfhs[j].GetLineAsByteArray(tfh2Pos);
                        if (tfh1ByteArray.SequenceEqual(tfh2ByteArray))
                        {
                            var line = System.Text.Encoding.Default.GetString(tfh1ByteArray);
                            totalDuplicates.Add(line);
                        }
                    }
                }
            }
        }
    }

    Console.WriteLine();
    if (totalDuplicates.Count > 0)
    {
        Console.WriteLine("Total number of duplicates: {0}", totalDuplicates.Count);
        Console.WriteLine("#######################");
        totalDuplicates.ForEach(x => Console.WriteLine("{0}", x));
        Console.WriteLine("#######################");
    }

    // Free resources
    foreach (var tfh in textFileHashers)
    {
        tfh.Dispose();
    }
}
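For reference, this is a minimal sketch of the TextFileHasher surface both Main variants rely on. The real class is defined elsewhere, so the member types and the LinePosition shape shown here are inferred from the calls above and are assumptions, not the actual implementation.

// Sketch only: names/types inferred from usage in Main, bodies stubbed out.
public class LinePosition
{
    public long Start { get; set; }   // assumed: byte offset of the line in the stream
    public int Length { get; set; }   // Main compares lengths before comparing bytes
}

public class TextFileHasher : IDisposable
{
    private readonly Stream stream;

    public TextFileHasher(string name, Stream stream)
    {
        Name = name;
        this.stream = stream;
    }

    public string Name { get; }

    // One hash per line, filled by CalculateFileHash (ulong, matching the dictionary key above).
    public List<ulong> Hashes { get; } = new List<ulong>();

    // Reads the stream line by line, recording a hash and a LinePosition per line.
    public void CalculateFileHash() { /* defined elsewhere */ }

    // Lines that occur more than once within this stream.
    public List<string> GetDuplicates() { /* defined elsewhere */ return new List<string>(); }

    // All positions of lines whose hash equals the given value.
    public List<LinePosition> GetLinePositions(ulong hash) { /* defined elsewhere */ return new List<LinePosition>(); }

    // Raw bytes of the line at the given position, used to rule out hash collisions.
    public byte[] GetLineAsByteArray(LinePosition position) { /* defined elsewhere */ return new byte[0]; }

    public void Dispose() => stream.Dispose();
}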