Ejemplo n.º 1
0
        //--------------------------------------------------------------------------------------------------------------------------------//
        //STEP-BY-STEP
        /// <summary>
        /// Builds the list of index pairs that link each n-gram of <paramref name="commonProfile"/> to its
        /// position in the two document profiles.
        /// </summary>
        /// <param name="profile1">Profile searched with <c>IndexOf</c>; supplies the second element of each pair.</param>
        /// <param name="profile2">Profile searched by word-for-word comparison; supplies the first element of each pair.</param>
        /// <param name="commonProfile">Profile holding the n-grams to locate in both profiles.</param>
        /// <returns>
        /// One <c>{ indexInProfile2, indexInProfile1 }</c> array per n-gram that was located in both profiles.
        /// N-grams that cannot be located in either profile are left out instead of being paired with a made-up index.
        /// </returns>
        public List <int[]> MatchedNgramSet(DocumentProfileStopNWords profile1, DocumentProfileStopNWords profile2, DocumentProfileStopNWords commonProfile)
        {
            List <int[]> setOfMatched = new List <int[]>();

            //Fetch both collections once instead of on every loop iteration
            var ngrams1 = profile1.getNgramsCollection();
            var ngrams2 = profile2.getNgramsCollection();

            foreach (var ngram in commonProfile.getNgramsCollection())
            {
                //NOTE(review): IndexOf() on a list of arrays presumably compares references, so this only
                //finds n-grams that are the very same array instances as the ones held by profile1
                int index1 = ngrams1.IndexOf(ngram);

                //IndexOf() does not work for the second profile, so its n-grams are compared word by word.
                //Scanning backwards and stopping at the first hit gives the highest matching index,
                //which is the same index a complete forward scan ends up with.
                int location = -1;
                for (int index2 = ngrams2.Count - 1; index2 >= 0 && location < 0; index2--)
                {
                    var candidate = ngrams2[index2];

                    //N-grams of different length can never be equal; without this check a longer candidate
                    //would index past the end of ngram and a shorter (or empty) one would match on its prefix
                    if (candidate.Length != ngram.Length)
                    {
                        continue;
                    }

                    bool allWordsEqual = true;
                    for (int i = 0; i < candidate.Length; i++)
                    {
                        //static string.Equals tolerates null words
                        if (!string.Equals(candidate[i], ngram[i]))
                        {
                            allWordsEqual = false;
                            break;
                        }
                    }
                    if (allWordsEqual)
                    {
                        location = index2;
                    }
                }

                //Record the pair only when the n-gram was really found in both profiles
                if (index1 >= 0 && location >= 0)
                {
                    setOfMatched.Add(new int[] { location, index1 });
                }
            }
            return(setOfMatched);
        }
Ejemplo n.º 2
0
        //--------------------------------------------------------------------------------------------------------------------------------//

        //Step-2
        //1.Get the intersected profile
        //2.Apply criterion G.belongsTo(p1 and p2) if member(g,C)<n2,
        //  where C is a list of the 6 most common words and member() is the number of members of g that belong to C
        //3.Return the intersection of the 2 profiles with only the n-grams that satisfy criterion (2)
        /// <summary>
        /// Filters a profile down to the n-grams in which fewer words than the n-gram length belong to the
        /// list of six very frequent words ("the", "of", "and", "a", "in", "to").
        /// </summary>
        /// <param name="profile">Profile whose n-grams are examined.</param>
        /// <returns>A new profile built from the n-grams that pass the filter, in their original order.</returns>
        public DocumentProfileStopNWords ApplyMatchCriterion(DocumentProfileStopNWords profile)
        {
            String[]        frequentWords = new String[] { "the", "of", "and", "a", "in", "to" };
            List <String[]> accepted      = new List <string[]>();

            foreach (var candidate in profile.getNgramsCollection())
            {
                //count how many words of this n-gram appear in the frequent-word list
                int frequentCount = 0;
                foreach (var token in candidate)
                {
                    foreach (var frequent in frequentWords)
                    {
                        if (!token.Equals(frequent))
                        {
                            continue;
                        }
                        //each word is counted at most once, so stop at the first hit
                        frequentCount++;
                        break;
                    }
                }

                //an n-gram made up entirely of frequent words is dropped
                if (frequentCount >= candidate.Length)
                {
                    continue;
                }
                accepted.Add(candidate);
            }
            return(new DocumentProfileStopNWords(accepted));
        }
Ejemplo n.º 3
0
        //Intersect two profiles (either stopNwords or letter) and return a new intersected profile
        /// <summary>
        /// Intersects two profiles: keeps every n-gram of <paramref name="profile1"/> that has at least one
        /// word-for-word identical n-gram in <paramref name="profile2"/>.
        /// </summary>
        /// <param name="profile1">Profile whose n-grams are the candidates; the kept arrays are these same instances.</param>
        /// <param name="profile2">Profile the candidates are looked up in.</param>
        /// <returns>A new profile built from the kept n-grams, in the order they occur in <paramref name="profile1"/>.</returns>
        public DocumentProfileStopNWords IntersectProfiles(DocumentProfileStopNWords profile1, DocumentProfileStopNWords profile2)
        {
            List <String[]> intersection = new List <string[]>();

            //Fetch the second collection once instead of once per n-gram of profile1
            var ngrams2 = profile2.getNgramsCollection();

            foreach (var ngram1 in profile1.getNgramsCollection())
            {
                bool foundInProfile2 = false;
                foreach (var ngram2 in ngrams2)
                {
                    //N-grams of different length can never be equal; without this check a shorter ngram2
                    //would be indexed past its end and a longer one would match on its prefix alone
                    if (ngram2.Length != ngram1.Length)
                    {
                        continue;
                    }

                    bool allWordsEqual = true;
                    for (int i = 0; i < ngram1.Length; i++)
                    {
                        //static string.Equals tolerates null words
                        if (!string.Equals(ngram1[i], ngram2[i]))
                        {
                            allWordsEqual = false;
                            break;
                        }
                    }
                    if (allWordsEqual)
                    {
                        //one match is enough, no need to look at the rest of profile2
                        foundInProfile2 = true;
                        break;
                    }
                }
                if (foundInProfile2)
                {
                    intersection.Add(ngram1);
                }
            }
            DocumentProfileStopNWords profile = new DocumentProfileStopNWords(intersection);

            return(profile);
        }
Ejemplo n.º 4
0
        //Step-1
        //1.Get the profiles of the suspicious and original document
        //2.Apply criterion(2)
        //3.Create a list M(dx,ds) with the indexes of the common n-grams (produced in 2.) between the two profiles
        //  i.e. M={(1,1), (2,2), (3,5)} (ngram 1 of the plagiarized document is matched with ngram 1 of original...)
        //4.Return M
        /// <summary>
        /// Builds the profiles of two documents, intersects them, applies the match criterion and returns
        /// the index pairs of the n-grams the two profiles have in common.
        /// </summary>
        /// <param name="pathToDocument1">Path handed to the profile builder for the first document.</param>
        /// <param name="pathToDocument2">Path handed to the profile builder for the second document.</param>
        /// <returns>
        /// One <c>{ indexInProfile2, indexInProfile1 }</c> array per common n-gram that was located in both profiles.
        /// N-grams that cannot be located in either profile are left out instead of being paired with a made-up index.
        /// </returns>
        public List <int[]> MatchedNgramSet(string pathToDocument1, string pathToDocument2)
        {
            DocumentProfileBuilder    builder  = new DocumentProfileBuilder();
            //NOTE(review): the last argument is presumably the n-gram length (8) - confirm against the builder
            DocumentProfileStopNWords profile1 = builder.GetDocumentProfileStopNWords(pathToDocument1, "Document", 8);
            DocumentProfileStopNWords profile2 = builder.GetDocumentProfileStopNWords(pathToDocument2, "Document", 8);

            Library library = new Library();
            DocumentProfileStopNWords commonProfile = ApplyMatchCriterion(library.IntersectProfiles(profile1, profile2));

            List <int[]> setOfMatched = new List <int[]>();

            //Fetch both collections once instead of on every loop iteration
            var ngrams1 = profile1.getNgramsCollection();
            var ngrams2 = profile2.getNgramsCollection();

            foreach (var ngram in commonProfile.getNgramsCollection())
            {
                //NOTE(review): IndexOf() on a list of arrays presumably compares references, so this only
                //finds n-grams that are the very same array instances as the ones held by profile1
                int index1 = ngrams1.IndexOf(ngram);

                //IndexOf() does not work for the second profile, so its n-grams are compared word by word.
                //Scanning backwards and stopping at the first hit gives the highest matching index,
                //which is the same index a complete forward scan ends up with.
                int location = -1;
                for (int index2 = ngrams2.Count - 1; index2 >= 0 && location < 0; index2--)
                {
                    var candidate = ngrams2[index2];

                    //N-grams of different length can never be equal; without this check a longer candidate
                    //would index past the end of ngram and a shorter (or empty) one would match on its prefix
                    if (candidate.Length != ngram.Length)
                    {
                        continue;
                    }

                    bool allWordsEqual = true;
                    for (int i = 0; i < candidate.Length; i++)
                    {
                        //static string.Equals tolerates null words
                        if (!string.Equals(candidate[i], ngram[i]))
                        {
                            allWordsEqual = false;
                            break;
                        }
                    }
                    if (allWordsEqual)
                    {
                        location = index2;
                    }
                }

                //Record the pair only when the n-gram was really found in both profiles
                if (index1 >= 0 && location >= 0)
                {
                    setOfMatched.Add(new int[] { location, index1 });
                }
            }
            return(setOfMatched);
        }
Ejemplo n.º 5
0
        //Step-2
        //1.Get the intersected profile
        //2.Apply criterion G.belongsTo(p1 and p2) if member(g,C)<n-1 and maxseq(g,C)<n-2,
        //  where C is a list of the 6 most common words, member() is the number of members of g that belong to C
        //  and maxseq() is the longest run of consecutive members of g that belong to C.
        //3.Return the intersection of the 2 profiles with only the n-grams that satisfy criterion (1)
        /// <summary>
        /// Filters a profile down to the n-grams that satisfy both conditions below, where C is the list of
        /// six very frequent words ("the", "of", "and", "a", "in", "to") and n is the n-gram length:
        /// fewer than n-1 words belong to C, and the longest run of consecutive words belonging to C is
        /// shorter than n-2.
        /// </summary>
        /// <param name="profile">Profile whose n-grams are examined.</param>
        /// <returns>A new profile built from the n-grams that pass the filter, in their original order.</returns>
        public DocumentProfileStopNWords ApplyCanditateRetrievalCriterion(DocumentProfileStopNWords profile)
        {
            String[]        mostCommon6     = new String[] { "the", "of", "and", "a", "in", "to" };
            List <String[]> nGramCollection = new List <string[]>();

            //iterate through all ngrams of the given profile
            foreach (var ngram in profile.getNgramsCollection())
            {
                int maxseq     = 0;
                int membersofC = 0;
                int currentsq  = 0;
                //iterate through all members of the ngram
                foreach (var word in ngram)
                {
                    if (Array.IndexOf(mostCommon6, word) >= 0)
                    {
                        membersofC++;
                        currentsq++;
                        //update the maximum while the run grows, so a run that reaches the end of the
                        //ngram is counted too (it used to be ignored because no later word closed it)
                        if (maxseq < currentsq)
                        {
                            maxseq = currentsq;
                        }
                    }
                    else
                    {
                        //a word outside C ends the current run
                        currentsq = 0;
                    }
                }
                //if criterion (1) is satisfied add this ngram to the collection
                if ((membersofC < ngram.Length - 1) && (maxseq < ngram.Length - 2))
                {
                    nGramCollection.Add(ngram);
                }
            }
            return(new DocumentProfileStopNWords(nGramCollection));
        }