Example #1
0
    /// <summary>
    /// Demo entry point: builds three in-memory texts, runs a <c>TextFileHasher</c> over each,
    /// prints the duplicates reported inside every stream and then compares every pair of
    /// streams, collecting the lines whose hashes AND raw bytes match.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(String[] args)
    {
        List<TextFileHasher> textFileHashers = new List<TextFileHasher>();

        // try/finally guarantees that every hasher created so far is disposed,
        // even when hashing or one of the comparisons below throws.
        try
        {
            // Each hasher is registered BEFORE CalculateFileHash() so that a failure
            // while hashing still leaves it reachable for the cleanup in finally.
            string         text1 = "abc\r\ncba\r\nabc";
            TextFileHasher tfh1  = new TextFileHasher("Text1", new MemoryStream(System.Text.Encoding.Default.GetBytes(text1)));
            textFileHashers.Add(tfh1);
            tfh1.CalculateFileHash();

            string         text2 = "def\r\ncba\r\nwet";
            TextFileHasher tfh2  = new TextFileHasher("Text2", new MemoryStream(System.Text.Encoding.Default.GetBytes(text2)));
            textFileHashers.Add(tfh2);
            tfh2.CalculateFileHash();

            string         text3 = "def\r\nbla\r\nwat";
            TextFileHasher tfh3  = new TextFileHasher("Text3", new MemoryStream(System.Text.Encoding.Default.GetBytes(text3)));
            textFileHashers.Add(tfh3);
            tfh3.CalculateFileHash();

            List<string> totalDuplicates = new List<string>();

            // Pairwise comparison of all hashers: quadratic in the number of streams.
            for (int i = 0; i < textFileHashers.Count; i++)
            {
                TextFileHasher outer = textFileHashers[i];

                // Duplicates reported by the hasher for its own stream.
                // NOTE(review): the original author estimated GetDuplicates() at O(n * m log m) - not verifiable from here.
                var duplicates = outer.GetDuplicates();
                if (duplicates.Count > 0)
                {
                    Console.WriteLine("There is {0} duplicates in the stream ({1})", duplicates.Count, outer.Name);
                    duplicates.ForEach(x => Console.WriteLine(x));
                }

                // Copy into the overall result instead of appending cross-stream matches to the
                // list handed out by GetDuplicates(): that list may be owned by the hasher.
                totalDuplicates.AddRange(duplicates);

                for (int j = i + 1; j < textFileHashers.Count; j++)
                {
                    TextFileHasher inner = textFileHashers[j];

                    // Intersect() is lazily evaluated; materialise it once so that counting and
                    // iterating below do not compute the intersection twice.
                    var equalHashes = outer.Hashes.Intersect(inner.Hashes).ToList();
                    Console.WriteLine("{0} and {1} have {2} equal hashes", outer.Name, inner.Name, equalHashes.Count);

                    // For every shared hash, fetch the line positions in both streams and compare
                    // the raw bytes, because equal hashes do not prove equal lines.
                    foreach (var equalHash in equalHashes)
                    {
                        var outerPositions = outer.GetLinePositions(equalHash);
                        var innerPositions = inner.GetLinePositions(equalHash);

                        foreach (var outerPos in outerPositions)
                        {
                            var outerBytes = outer.GetLineAsByteArray(outerPos);
                            foreach (var innerPos in innerPositions)
                            {
                                var innerBytes = inner.GetLineAsByteArray(innerPos);

                                if (outerBytes.SequenceEqual(innerBytes))
                                {
                                    // Decode with the same encoding that produced the stream bytes above.
                                    totalDuplicates.Add(System.Text.Encoding.Default.GetString(outerBytes));
                                }
                            }
                        }
                    }
                }
            }

            Console.WriteLine();
            if (totalDuplicates.Count > 0)
            {
                Console.WriteLine("Total number of duplicates: {0}", totalDuplicates.Count);
                Console.WriteLine("#######################");
                totalDuplicates.ForEach(x => Console.WriteLine("{0}", x));
                Console.WriteLine("#######################");
            }
        }
        finally
        {
            // Free resources
            foreach (var tfh in textFileHashers)
            {
                tfh.Dispose();
            }
        }
    }
Example #2
0
    /// <summary>
    /// Demo entry point: builds three in-memory texts, runs a <c>TextFileHasher</c> over each,
    /// groups all line hashes in one dictionary (hash to hasher to line positions) and collects
    /// both the duplicates reported inside every stream and the lines shared between streams.
    /// </summary>
    /// <remarks>
    /// Intra-stream duplicates are gathered for every hasher up front, so they precede the
    /// cross-stream matches in the printed list. Earlier revisions gathered them only for the
    /// first hasher registered under each hash, which skipped any stream whose hashes all
    /// occurred in an earlier stream.
    /// </remarks>
    /// <param name="args">Command-line arguments (unused).</param>
    public static void Main(String[] args)
    {
        List<TextFileHasher> textFileHashers = new List<TextFileHasher>();

        // try/finally guarantees that every hasher created so far is disposed,
        // even when hashing or one of the comparisons below throws.
        try
        {
            // Each hasher is registered BEFORE CalculateFileHash() so that a failure
            // while hashing still leaves it reachable for the cleanup in finally.
            string         text1 = "abc\r\ncba\r\nabc";
            TextFileHasher tfh1  = new TextFileHasher("Text1", new MemoryStream(System.Text.Encoding.Default.GetBytes(text1)));
            textFileHashers.Add(tfh1);
            tfh1.CalculateFileHash();

            string         text2 = "def\r\ncba\r\nwet";
            TextFileHasher tfh2  = new TextFileHasher("Text2", new MemoryStream(System.Text.Encoding.Default.GetBytes(text2)));
            textFileHashers.Add(tfh2);
            tfh2.CalculateFileHash();

            string         text3 = "def\r\nbla\r\nwat";
            TextFileHasher tfh3  = new TextFileHasher("Text3", new MemoryStream(System.Text.Encoding.Default.GetBytes(text3)));
            textFileHashers.Add(tfh3);
            tfh3.CalculateFileHash();

            List<string> totalDuplicates = new List<string>();

            // Duplicates inside each individual stream, collected exactly once per hasher.
            foreach (var hasher in textFileHashers)
            {
                totalDuplicates.AddRange(hasher.GetDuplicates());
            }

            // hash -> (hasher -> positions of the lines with that hash in the hasher's stream)
            Dictionary<ulong, Dictionary<TextFileHasher, List<LinePosition>>> totalHashes =
                new Dictionary<ulong, Dictionary<TextFileHasher, List<LinePosition>>>();

            foreach (var hasher in textFileHashers)
            {
                foreach (var lineHash in hasher.Hashes)
                {
                    Dictionary<TextFileHasher, List<LinePosition>> positionsByHasher;
                    if (!totalHashes.TryGetValue(lineHash, out positionsByHasher))
                    {
                        positionsByHasher     = new Dictionary<TextFileHasher, List<LinePosition>>();
                        totalHashes[lineHash] = positionsByHasher;
                    }

                    // Store the positions once per (hash, hasher). Appending them again when the
                    // same hash is enumerated twice would only duplicate the entries.
                    if (!positionsByHasher.ContainsKey(hasher))
                    {
                        positionsByHasher[hasher] = hasher.GetLinePositions(lineHash);
                    }
                }
            }

            // Cross-stream duplicates: only hashes present in more than one hasher matter.
            foreach (var entry in totalHashes)
            {
                var positionsByHasher = entry.Value;
                if (positionsByHasher.Count <= 1)
                {
                    continue;
                }

                var tfhs = positionsByHasher.Keys.ToArray();
                for (int i = 0; i < tfhs.Length; i++)
                {
                    var outerPositions = positionsByHasher[tfhs[i]];
                    for (int j = i + 1; j < tfhs.Length; j++)
                    {
                        var innerPositions = positionsByHasher[tfhs[j]];

                        foreach (var outerPos in outerPositions)
                        {
                            var outerBytes = tfhs[i].GetLineAsByteArray(outerPos);
                            foreach (var innerPos in innerPositions)
                            {
                                // Lines of different length cannot be equal; skip reading the second one.
                                if (outerPos.Length != innerPos.Length)
                                {
                                    continue;
                                }

                                var innerBytes = tfhs[j].GetLineAsByteArray(innerPos);

                                // Equal hashes do not prove equal lines, so compare the raw bytes.
                                if (outerBytes.SequenceEqual(innerBytes))
                                {
                                    // Decode with the same encoding that produced the stream bytes above.
                                    totalDuplicates.Add(System.Text.Encoding.Default.GetString(outerBytes));
                                }
                            }
                        }
                    }
                }
            }

            Console.WriteLine();
            if (totalDuplicates.Count > 0)
            {
                Console.WriteLine("Total number of duplicates: {0}", totalDuplicates.Count);
                Console.WriteLine("#######################");
                totalDuplicates.ForEach(x => Console.WriteLine("{0}", x));
                Console.WriteLine("#######################");
            }
        }
        finally
        {
            // Free resources
            foreach (var tfh in textFileHashers)
            {
                tfh.Dispose();
            }
        }
    }