Example #1
0
        // Verifies that handing a null request to Match is rejected with ArgumentException.
        public void MissingRequestThrowsArgumentException()
        {
            // arrange: a matcher built through its parameterless constructor
            var matcher = new TextMatch();

            // act/assert: the call itself is expected to throw
            Assert.Throws<ArgumentException>(() => matcher.Match(null));
        }
Example #2
0
        // Verifies that Find yields an empty string when the sub-text never occurs in the text.
        public void Should_Return_Nothng_If_There_Is_No_Match()
        {
            // arrange
            var haystack = "Polly put the kettle on, polly put the kettle on, polly put the kettle on we’ll all have tea";
            var needle = "world";

            // act
            var positions = TextMatch.Find(haystack, needle);

            // assert: no occurrences are reported as the empty string
            Assert.AreEqual("", positions);
        }
Example #3
0
        // Verifies the positions Find reports for the two-character sub-text "ll" in a long sentence.
        public void Should_Find_Correct_Positions_When_Text_Is_Long_2()
        {
            // arrange
            var haystack = "Polly put the kettle on, polly put the kettle on, polly put the kettle on we’ll all have tea";
            var needle = "ll";

            // act
            var positions = TextMatch.Find(haystack, needle);

            // assert: positions come back as one comma-separated string
            Assert.AreEqual("3,28,53,78,82", positions);
        }
Example #4
0
        // Verifies the positions Find reports for "world" in a five-word text.
        public void Should_Find_Correct_Positions_When_Text_Has_5_Words()
        {
            // arrange
            var haystack = "Hello world hello world hello";
            var needle = "world";

            // act
            var positions = TextMatch.Find(haystack, needle);

            // assert: both occurrences are reported, comma-separated
            Assert.AreEqual("7,19", positions);
        }
Example #5
0
        /// <summary>
        ///     Stores the header name and value for the rule and builds a matcher for each.
        /// </summary>
        /// <param name="name">The name of the HTTP header; <c>null</c> is stored as an empty string.</param>
        /// <param name="value">The value of the HTTP header; <c>null</c> is stored as an empty string.</param>
        private void InitializeRule(string name, string value)
        {
            // Normalise missing inputs so the matchers never receive null.
            _headerNameValue = (name != null) ? name : string.Empty;
            _headerValueValue = (value != null) ? value : string.Empty;

            // One matcher per stored string, both created with the same flag.
            _nameTextMatch = new TextMatch(_headerNameValue, true);
            _valueTextMatch = new TextMatch(_headerValueValue, true);

            IsInitialized = true;
        }
Example #6
0
    /// <summary>
    ///     Tokenises <paramref name="input"/> and returns the first run of three consecutive recognised
    ///     tokens whose categories are 1, 2 and 3, in that order.
    /// </summary>
    /// <remarks>
    ///     Category 1: a two- or three-character token whose first character is a digit or a letter in ก-ฮ
    ///     and whose remaining characters are letters in ก-ฮ.
    ///     Category 2: a token of one to four digits.
    ///     Category 3: a token equal to an entry returned by <c>GetProvinces()</c>.
    ///     Tokens that fall in none of these categories are dropped before the search.
    /// </remarks>
    /// <param name="input">
    ///     Text to scan. Literal backslash-n sequences, the separator characters and hyphens all act as token boundaries.
    /// </param>
    /// <returns>
    ///     The three tokens joined by the two-character sequence backslash + n (not a real line break),
    ///     or an empty string when no such run exists.
    /// </returns>
    public static System.String func(System.String input)
    {
        List<TextMatch> recognised = new List<TextMatch>();
        Regex firstGroup = new Regex("^[0-9ก-ฮ][ก-ฮ]?[ก-ฮ]$");
        Regex digitGroup = new Regex("^[0-9]{1,4}$");
        Regex separators = new Regex("[*'\",_&#^@/.:;+? ]");

        // Convert every kind of boundary into a real line break, then split on it.
        string withLineBreaks = separators.Replace(input.Replace(@"\n", "\n"), "\n").Replace("-", "\n");
        string[] tokens = withLineBreaks.Split('\n');

        foreach (string token in tokens)
        {
            if (firstGroup.IsMatch(token))
            {
                recognised.Add(new TextMatch() { Text = token, Match = 1 });
                continue;
            }

            if (digitGroup.IsMatch(token))
            {
                recognised.Add(new TextMatch() { Text = token, Match = 2 });
                continue;
            }

            if (GetProvinces().Find(p => p == token) != null)
            {
                recognised.Add(new TextMatch() { Text = token, Match = 3 });
            }
        }

        // Walk the category-1 entries from left to right; the first one followed by a 2 and a 3 wins.
        int searchFrom = 0;
        while (true)
        {
            int candidate = recognised.FindIndex(searchFrom, m => m.Match == 1);
            if (candidate < 0)
            {
                // No further category-1 entry: nothing was found.
                return "";
            }

            // Only inspect the triple when two more entries exist after the candidate.
            if (recognised.Count >= candidate + 3)
            {
                TextMatch first = recognised[candidate];
                TextMatch second = recognised[candidate + 1];
                TextMatch third = recognised[candidate + 2];
                if (first.Match == 1 && second.Match == 2 && third.Match == 3)
                {
                    return string.Format(@"{0}\n{1}\n{2}", first.Text, second.Text, third.Text);
                }
            }

            searchFrom = candidate + 1;
        }
    }
 // Override that, instead of running an action body, returns a T4MVC call-info result describing
 // a call to the GetSubTextPositions action with the given input form.
 public override System.Web.Mvc.ActionResult GetSubTextPositions(TextMatch.UI.Web.Models.Home.GetSubTextPositionsViewModel inputForm)
 {
     T4MVC_System_Web_Mvc_ActionResult routeCall = new T4MVC_System_Web_Mvc_ActionResult(Area, Name, ActionNames.GetSubTextPositions);

     // Record the input form in the route values under the view model's form name.
     ModelUnbinderHelpers.AddRouteValues(routeCall.RouteValueDictionary, GetSubTextPositionsViewModel.FormName, inputForm);

     // Let the partial-method hook see (and possibly adjust) the call before it is returned.
     GetSubTextPositionsOverride(routeCall, inputForm);

     return routeCall;
 }
Example #8
0
        /// <summary>
        ///     Stores the rule's text and builds the matcher for it.
        /// </summary>
        /// <param name="value">The value used for the URL rule; <c>null</c> is stored as an empty string.</param>
        public void InitializeRule(string value)
        {
            // Normalise a missing value so the matcher never receives null.
            _textMatchValue = (value != null) ? value : string.Empty;
            _textMatch = new TextMatch(_textMatchValue, true);

            IsInitialized = true;
        }
 // Partial-method hook: receives the call-info result and the input form. Only the declaration
 // appears here (no body), so whether anything runs depends on an implementation supplied elsewhere.
 partial void GetSubTextPositionsOverride(T4MVC_System_Web_Mvc_ActionResult callInfo, TextMatch.UI.Web.Models.Home.GetSubTextPositionsViewModel inputForm);
Example #10
0
 /// <summary>
 ///     Stores the rule's HTTP method text and builds the matcher for it.
 /// </summary>
 /// <param name="httpMethod">The HTTP method for the rule; <c>null</c> is stored as an empty string.</param>
 public void InitializeRule(string httpMethod)
 {
     // Normalise a missing method so the matcher never receives null.
     _textMatchValue = (httpMethod != null) ? httpMethod : string.Empty;
     _textMatch = new TextMatch(_textMatchValue, true);

     IsInitialized = true;
 }
Example #11
0
        /// <summary>
        ///     Collects approximate matches of the rashi starting text against windows of gemara words,
        ///     trying variants in which one word of the starting text and/or up to two words of the
        ///     gemara window are ignored.
        /// </summary>
        /// <param name="curDaf">Supplies <c>allWords</c> and <c>wordhashes</c>, both indexed by word position.</param>
        /// <param name="curRashi">Supplies <c>startingTextNormalized</c>, <c>startingText</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word position tried as the start of a window.</param>
        /// <param name="endBound">Upper limit: a window is tried only while <c>iWord + wordCount - 1 &lt;= endBound</c>.</param>
        /// <param name="threshold">Passed to <c>IsStringMatchup</c> for phrases shorter than four words.</param>
        /// <returns>
        ///     The matches found, at most one per starting word position; an empty list when the
        ///     starting text has fewer than two words.
        /// </returns>
        private static List<TextMatch> GetAllApproximateMatchesWithWordSkip(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();
            // Start positions that already produced a match; each position is reported only once.
            List<int> usedStartwords = new List<int>();

            // No-op: the body is empty, so this test has no effect beyond evaluating Contains
            // (looks like a leftover debugging hook).
            if (curRashi.startingTextNormalized.Contains("אלא סיד")) {

            }

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;

            // No point to this unless we have at least 2 words
            if (wordCount < 2) return new List<TextMatch>();

            // Iterate through all the starting words within the phrase, allowing for one word to be ignored
            // (-1 means no word of the starting text is ignored)
            for (int iWordToIgnore = -1; iWordToIgnore < wordCount; iWordToIgnore++) {

                // Rebuilt on every pass because cvhashes is mutated by RemoveAt below.
                List<string> rashiwords = Regex.Split(startText.Trim(), @"\s+").ToList();
                List<long> cvhashes = CalculateHashes(rashiwords);

                string alternateStartText = "";
                if (iWordToIgnore >= 0) {
                    // NOTE(review): only cvhashes is shortened here; rashiwords keeps every word, so in the
                    // hash/first-letter comparison below cvhashes[i] and rashiwords[i] refer to different
                    // words for i >= iWordToIgnore — confirm this is intended.
                    cvhashes.RemoveAt(iWordToIgnore);
                    alternateStartText = GetStringWithRemovedWord(startText, iWordToIgnore).Trim();
                }
                else {
                    alternateStartText = startText;
                }

                // Iterate through all possible starting words within the gemara, allowing for the word afterward to be ignored
                for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++) {

                    // Start from -1 (which means the phrase as is)
                    for (int gemaraWordToIgnore = -1; gemaraWordToIgnore < wordCount; gemaraWordToIgnore++) {

                        // no point in skipping first word - we might as well just let the item start from the next startword
                        if (gemaraWordToIgnore == 0) continue;

                        // and, choose a second word to ignore (-1 means no second word)
                        for (int gemaraWord2ToIgnore = -1; gemaraWord2ToIgnore < wordCount; gemaraWord2ToIgnore++) {

                            // if not skipping first, this is not relevant unless it is also -1
                            if (gemaraWordToIgnore == -1 && gemaraWord2ToIgnore != -1)
                                continue;

                            // we don't need to do things both directions
                            if (gemaraWord2ToIgnore != -1 && gemaraWord2ToIgnore < gemaraWordToIgnore)
                                continue;

                            // if we are skipping a cv word, don't also skip a second word
                            if (iWordToIgnore != -1 && gemaraWord2ToIgnore != -1) {
                                continue;
                            }

                            // if this would bring us to the end, don't do it
                            if (gemaraWord2ToIgnore != -1 && iWord + wordCount >= curDaf.allWords.Count)
                                continue;

                            bool fIsMatch = false;
                            double distance = 0;
                            // totaldistance is never read in this method.
                            double totaldistance = 0;

                            if (wordCount >= 4) {

                                // Integer division: number of word mismatches tolerated after the first word.
                                int nonMatchAllowance = wordCount/2 - 1;

                                // The first word must match by hash before anything else is checked.
                                long initialhash = cvhashes[0];
                                if (curDaf.wordhashes[iWord] == initialhash) {
                                    // see if the rest match up
                                    // offset counts the ignored gemara words passed so far
                                    int offset = 0;
                                    fIsMatch = true;
                                    // NOTE(review): the bound is wordCount - 1, so index wordCount - 1 is never
                                    // compared here — confirm whether the last word is meant to be skipped.
                                    for (int icvword = 1; icvword < wordCount - 1; icvword++) {
                                        if (icvword == gemaraWordToIgnore || icvword == gemaraWord2ToIgnore) {
                                            offset++;
                                        }

                                        // check the hash, and or first letter
                                        // (a word counts as a mismatch only when BOTH the hash and the first letter differ)
                                        if (curDaf.wordhashes[iWord + icvword + offset] != cvhashes[icvword] &&
                                            curDaf.allWords[iWord + icvword + offset][0] != rashiwords[icvword][0]) {

                                                nonMatchAllowance--;

                                                if (nonMatchAllowance < 0) {
                                                fIsMatch = false;
                                                break;
                                            }
                                        }
                                    }
                                }
                            }
                            else {
                                // build the phrase
                                string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);

                                // Now check if it is a match
                                fIsMatch = IsStringMatchup(alternateStartText, targetPhrase, threshold, out distance);
                            }
                            // If it is, add it in.
                            if (fIsMatch) {

                                // Only the first match found for a given start position is kept.
                                if (usedStartwords.Contains(iWord)) continue;
                                TextMatch curMatch = new TextMatch();

                                // if gemaraWordToIgnore is -1, then we didn't skip anything in the gemara.
                                // if iWordToIgnore is -1, then we didn't skip anything in the main phrase

                                // whether or not we used the two-letter shortcut, let's calculate full distance here.
                                string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                                double dist = ComputeLevenshteinDistanceByWord(alternateStartText, targetPhrase);

                                // add penalty for skipped words
                                // (fullWordValue is defined outside this method)
                                if (gemaraWordToIgnore >= 0)
                                    dist += fullWordValue;
                                if (gemaraWord2ToIgnore >= 0)
                                    dist += fullWordValue;
                                if (iWordToIgnore >= 0)
                                    dist += fullWordValue;

                                // Score: smoothed distance relative to the character length of the full starting text,
                                // scaled by normalizingFactor and truncated to int (both factors are defined outside this method).
                                int normalizedDistance = (int) ((dist + smoothingFactor)/(startText.Length + smoothingFactor)*normalizingFactor);
                                curMatch.score = normalizedDistance;
                                curMatch.textToMatch = curRashi.startingText;

                                // the "text matched" is the actual text of the gemara, including the word we skipped.
                                curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                                curMatch.startWord = iWord;
                                curMatch.endWord = iWord + wordCount - 1;

                                // if we skipped the last word or two words, then we should cut them out of here
                                if (gemaraWordToIgnore == wordCount - 2 && gemaraWord2ToIgnore == wordCount -1) {
                                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 2);
                                    curMatch.endWord-=2;
                                }
                                else if (gemaraWordToIgnore == wordCount - 1) {
                                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 1);
                                    curMatch.endWord--;
                                }

                                allMatches.Add(curMatch);

                                usedStartwords.Add(iWord);
                                // Leaves only the innermost (second-ignored-word) loop; the usedStartwords check
                                // above stops later combinations from adding this start position again.
                                break;
                            }
                        }
                    }
                }
            }
            return allMatches;
        }
Example #12
0
        /// <summary>
        ///     Collects matches of the rashi starting text against gemara words where one side writes a
        ///     quote-marked abbreviation and the other side spells the words out.
        /// </summary>
        /// <remarks>
        ///     <c>fIsMatch</c> is only ever set to true inside the two abbreviation branches, so a window in
        ///     which neither side contains a quote-marked word is never added to the result.
        /// </remarks>
        /// <param name="curDaf">Supplies <c>allWords</c>, the gemara words indexed by position.</param>
        /// <param name="curRashi">Supplies <c>startingTextNormalized</c>, <c>startingText</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word position tried as the start of a window.</param>
        /// <param name="endBound">Upper limit: a start is tried only while <c>start + wordCount - 1 &lt;= endBound</c>.</param>
        /// <param name="threshold">Passed to <c>IsStringMatchup</c> for the plain word-to-word comparisons.</param>
        /// <returns>The matches found; an empty list when the starting text has zero words.</returns>
        private static List<TextMatch> GetAllApproximateMatchesWithAbbrev(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;
            if (wordCount == 0) return allMatches;

            // No-op: the body is empty, so this test has no effect beyond evaluating Contains
            // (looks like a leftover debugging hook).
            if (startText.Contains("חוצה")) {

            }

            // convert string into an array of words
            string[] startTextWords = Regex.Split(startText, @"\s+");

            // go through all possible starting words in the gemara text
            for (int iStartingWordInGemara = startBound; iStartingWordInGemara <= curDaf.allWords.Count - wordCount && iStartingWordInGemara + wordCount - 1 <= endBound; iStartingWordInGemara++) {

                bool fIsMatch = false;
                // Extra gemara words consumed because a rashi abbreviation expanded to several gemara words.
                int offsetWithinGemara = 0;
                // Extra rashi words consumed because a gemara abbreviation expanded to several rashi words.
                int offsetWithinRashiCV = 0;
                double distance = 0;
                double totaldistance = 0;

                // now we loop according to the number of words in the cv

                // .. keep track of how the gemara text differs from rashi length
                int gemaraDifferential = 0;

                for (int iWordWithinPhrase = 0; iWordWithinPhrase + offsetWithinRashiCV < wordCount; iWordWithinPhrase++) {

                    // first check if the cv word has a quotemark
                    if (startTextWords[iWordWithinPhrase + offsetWithinRashiCV].Contains("\"")) {

                        // get our rashei teivot (abbreviation) word without the quote mark
                        string cleanRT = startTextWords[iWordWithinPhrase + offsetWithinRashiCV].Replace("\"", "");
                        int maxlen = cleanRT.Length;

                        // let's see if this matches the start of the next few words
                        int curpos = iStartingWordInGemara + iWordWithinPhrase + offsetWithinGemara;
                        fIsMatch = false;

                        // Case 1: each abbreviation letter is the first letter of its own gemara word.
                        if (curpos + maxlen <= curDaf.allWords.Count) {
                            fIsMatch = true;
                            for (int igemaraword = curpos; igemaraword < curpos + maxlen; igemaraword++) {
                                if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos]) {
                                    fIsMatch = false;
                                    break;
                                }
                            }
                            if (fIsMatch) {
                                // we condensed maxlen words into 1. minus one, because later we'll increment one.
                                offsetWithinGemara += maxlen - 1;
                            }
                        }

                        // let's see if we can match by combining the first two into one word
                        // Case 2: the first gemara word supplies the first two letters, each later word one letter.
                        if (curpos + maxlen <= curDaf.allWords.Count + 1) {

                            if (!fIsMatch && maxlen > 2) {

                                fIsMatch = true;
                                if (curDaf.allWords[curpos].Length < 2 || curDaf.allWords[curpos][0] != cleanRT[0] || curDaf.allWords[curpos][1] != cleanRT[1]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 1; igemaraword++) {
                                        if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 1]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 1 gemara words were consumed here; one of them is covered by the loop's own increment.
                                    offsetWithinGemara += maxlen - 2;
                                }
                            }
                        }

                        // let's see if we can match by combining the first three into one word
                        // Case 3: the first gemara word supplies the first three letters, each later word one letter.
                        if (curpos + maxlen <= curDaf.allWords.Count + 2) {

                            if (!fIsMatch && maxlen > 3) {

                                fIsMatch = true;
                                if (curDaf.allWords[curpos].Length < 3 || curDaf.allWords[curpos][0] != cleanRT[0] || curDaf.allWords[curpos][1] != cleanRT[1] ||
                                    curDaf.allWords[curpos][2] != cleanRT[2]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 2; igemaraword++) {
                                        if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 2]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 2 gemara words were consumed here; one of them is covered by the loop's own increment.
                                    offsetWithinGemara += maxlen - 3;
                                }
                            }
                        }

                        // None of the three expansions fit: this start position is not a match.
                        if (!fIsMatch) break;

                        // now increment the offset to correspond, so that we'll know we're skipping over x number of words
                    }
                    // NOTE(review): no bounds check is visible for this index; once offsetWithinGemara is
                    // positive it can reach past the window the outer loop guaranteed — confirm it cannot
                    // exceed curDaf.allWords.Count - 1. The same index is used in the plain comparison below.
                    else if (curDaf.allWords[iStartingWordInGemara + offsetWithinGemara + iWordWithinPhrase].Contains("\"")) {

                        // Mirror image of the branch above: the gemara word is the abbreviation and the
                        // rashi words are the spelled-out side.

                        // get our rashei teivot (abbreviation) word without the quote mark
                        string cleanRT = curDaf.allWords[iStartingWordInGemara + offsetWithinGemara + iWordWithinPhrase].Replace("\"", "");
                        int maxlen = cleanRT.Length;

                        // let's see if this matches the start of the next few words
                        int curpos = iWordWithinPhrase + offsetWithinRashiCV;
                        fIsMatch = false;

                        // Case 1: each abbreviation letter is the first letter of its own rashi word.
                        if (curpos + maxlen <= wordCount) {
                            fIsMatch = true;
                            for (int icvword = curpos; icvword < curpos + maxlen; icvword++) {
                                if (startTextWords[icvword][0] != cleanRT[icvword - curpos]) {
                                    fIsMatch = false;
                                    break;
                                }
                            }
                            if (fIsMatch) {
                                // we condensed maxlen words into 1. minus one, because later we'll increment one.
                                offsetWithinRashiCV += maxlen - 1;
                            }
                        }

                        // let's see if we can match by combining the first two into one word
                        // Case 2: the first rashi word supplies the first two letters, each later word one letter.
                        if (curpos + maxlen <= wordCount + 1) {
                            if (!fIsMatch && maxlen > 2) {

                                fIsMatch = true;
                                if (startTextWords[curpos].Length < 2 || startTextWords[curpos][0] != cleanRT[0] || startTextWords[curpos][1] != cleanRT[1]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int icvword = curpos + 1; icvword < curpos + maxlen - 1; icvword++) {
                                        if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 1]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 1 rashi words were consumed here; one of them is covered by the loop's own increment.
                                    offsetWithinRashiCV += maxlen - 2;
                                }
                            }
                        }

                        // let's see if we can match by combining the first three into one word
                        // Case 3: the first rashi word supplies the first three letters, each later word one letter.
                        if (curpos + maxlen <= wordCount + 2) {

                            if (!fIsMatch && maxlen > 3) {

                                fIsMatch = true;
                                if (startTextWords[curpos].Length < 3 || startTextWords[curpos][0] != cleanRT[0] || startTextWords[curpos][1] != cleanRT[1] ||
                                    startTextWords[curpos][2] != cleanRT[2]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int icvword = curpos + 1; icvword < curpos + maxlen - 2; icvword++) {
                                        if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 2]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 2 rashi words were consumed here; one of them is covered by the loop's own increment.
                                    offsetWithinRashiCV += maxlen - 3;
                                }
                            }
                        }

                        // None of the three expansions fit: this start position is not a match.
                        if (!fIsMatch) break;
                    }
                    else {
                        // great, this is a basic compare.
                        bool fMatch = IsStringMatchup(startTextWords[iWordWithinPhrase + offsetWithinRashiCV], curDaf.allWords[iStartingWordInGemara + offsetWithinGemara + iWordWithinPhrase], threshold, out distance);
                        totaldistance += distance;
                        // if these words don't match, break and this isn't a match.
                        // (a successful compare leaves fIsMatch unchanged; it never sets it to true)
                        if (!fMatch) {
                            fIsMatch = false; break;
                        }
                    }
                }

                // Net number of words by which the gemara span is shorter than the rashi phrase
                // (negative when the gemara span is longer).
                gemaraDifferential = offsetWithinRashiCV;
                gemaraDifferential -= offsetWithinGemara;

                // If it is, add it in.
                if (fIsMatch) {
                    TextMatch curMatch = new TextMatch();
                    curMatch.textToMatch = curRashi.startingText;
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iStartingWordInGemara, wordCount - gemaraDifferential);
                    curMatch.startWord = iStartingWordInGemara;
                    // NOTE(review): textMatched above spans wordCount - gemaraDifferential words, yet endWord is
                    // set without a "- 1"; elsewhere in this file endWord is computed as start + wordCount - 1.
                    // Possible off-by-one — confirm whether endWord is meant to be inclusive here.
                    curMatch.endWord = iStartingWordInGemara + wordCount - gemaraDifferential;

                    // calculate the score, adding in the penalty for abbreviation
                    // (abbreviationPenalty, smoothingFactor and normalizingFactor are defined outside this method)
                    totaldistance += abbreviationPenalty;
                    int normalizedDistance = (int)((totaldistance + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.score = normalizedDistance;

                    allMatches.Add(curMatch);
                }
            }

            return allMatches;
        }
Example #13
0
        /// <summary>
        ///     Collects every window of gemara words, within the given bounds, that approximately matches
        ///     the rashi starting text, without skipping words or expanding abbreviations.
        /// </summary>
        /// <remarks>
        ///     Phrases of four or more words are compared through per-word hashes: the first word must match
        ///     exactly by hash, and the remaining words may mismatch up to
        ///     <c>ceil(wordCount * threshold * 1.35)</c> times. Shorter phrases are compared as whole strings
        ///     through <c>IsStringMatchup</c>.
        /// </remarks>
        /// <param name="curDaf">Supplies <c>allWords</c> and <c>wordhashes</c>, both indexed by word position.</param>
        /// <param name="curRashi">Supplies <c>startingTextNormalized</c>, <c>startingText</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word position tried as the start of a window.</param>
        /// <param name="endBound">Upper limit: a window is tried only while <c>iWord + wordCount - 1 &lt;= endBound</c>.</param>
        /// <param name="threshold">Tolerance used both for the mismatch allowance and by <c>IsStringMatchup</c>.</param>
        /// <returns>The matches found, in order of start position; an empty list when the starting text has zero words.</returns>
        private static List<TextMatch> GetAllApproximateMatches(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;
            if (wordCount == 0) return allMatches;

            // The hashes of the starting text do not depend on the candidate position. They are computed
            // lazily, on first use, so they are built at most once per call instead of once per candidate
            // (and still not at all when no candidate is ever examined).
            List<long> cvhashes = null;

            // Okay, start going through all the permutations..
            double distance = 0;
            for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++) {

                bool fIsMatch = false;
                // if phrase is 4 or more words, use the 2-letter hashes
                if (wordCount >= 4) {
                    if (cvhashes == null) {
                        cvhashes = CalculateHashes(Regex.Split(startText.Trim(), @"\s+").ToList());
                    }

                    // The first word acts as a cheap filter: it must match exactly by hash.
                    if (curDaf.wordhashes[iWord] == cvhashes[0]) {

                        // see if the rest match up
                        int mismatches = 0;
                        for (int icvword = 1; icvword < wordCount; icvword++) {
                            if (curDaf.wordhashes[iWord + icvword] != cvhashes[icvword]) {
                                mismatches++;
                            }
                        }

                        // now we need to decide if we can let it go
                        int allowedMismatches = (int)Math.Ceiling(wordCount * threshold * 1.35);
                        if (mismatches <= allowedMismatches) {
                            distance = mismatches;
                            fIsMatch = true;
                        }
                    }
                }
                else {
                    // build the phrase
                    string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);

                    // Now check if it is a match.
                    fIsMatch = IsStringMatchup(startText, targetPhrase, threshold, out distance);
                }

                // If it is, add it in.
                if (fIsMatch) {
                    TextMatch curMatch = new TextMatch();
                    curMatch.textToMatch = curRashi.startingText;
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                    curMatch.startWord = iWord;
                    curMatch.endWord = iWord + wordCount - 1;

                    // calculate the score - how distant is it
                    // (smoothed distance relative to the character length of the starting text, scaled and
                    // truncated to int; smoothingFactor and normalizingFactor are defined outside this method)
                    double dist = ComputeLevenshteinDistanceByWord(startText, curMatch.textMatched);
                    int normalizedDistance = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.score = normalizedDistance;

                    allMatches.Add(curMatch);
                }
            }

            return allMatches;
        }