/// <summary>
/// Verifies that calling <c>Match</c> with a null request raises an <see cref="ArgumentException"/>.
/// </summary>
public void MissingRequestThrowsArgumentException()
{
    // Arrange: a matcher built with its parameterless constructor.
    var matcher = new TextMatch();

    // Act + Assert: a null request must be rejected.
    Assert.Throws<ArgumentException>(() => matcher.Match(null));
}
/// <summary>
/// Verifies that <c>TextMatch.Find</c> returns an empty string when the sub-text
/// does not occur anywhere in the text.
/// </summary>
public void Should_Return_Nothng_If_There_Is_No_Match()
{
    // Arrange
    const string haystack = "Polly put the kettle on, polly put the kettle on, polly put the kettle on we’ll all have tea";
    const string needle = "world";

    // Act
    var positions = TextMatch.Find(haystack, needle);

    // Assert: no occurrence means an empty result.
    Assert.AreEqual("", positions);
}
/// <summary>
/// Verifies that <c>TextMatch.Find</c> reports every occurrence of a two-character
/// sub-text in a long sentence as a comma-separated list of positions.
/// </summary>
public void Should_Find_Correct_Positions_When_Text_Is_Long_2()
{
    // Arrange
    const string haystack = "Polly put the kettle on, polly put the kettle on, polly put the kettle on we’ll all have tea";
    const string needle = "ll";

    // Act
    var positions = TextMatch.Find(haystack, needle);

    // Assert: the expected positions are 1-based character offsets of each "ll".
    Assert.AreEqual("3,28,53,78,82", positions);
}
/// <summary>
/// Verifies that <c>TextMatch.Find</c> reports both occurrences of "world"
/// in a five-word sentence.
/// </summary>
public void Should_Find_Correct_Positions_When_Text_Has_5_Words()
{
    // Arrange
    const string haystack = "Hello world hello world hello";
    const string needle = "world";

    // Act
    var positions = TextMatch.Find(haystack, needle);

    // Assert: "world" starts at the 7th and 19th characters (1-based).
    Assert.AreEqual("7,19", positions);
}
/// <summary>
/// Initializes the rule from a header name and header value.
/// </summary>
/// <remarks>
/// Null arguments are stored as empty strings. One <c>TextMatch</c> is created for the
/// name and one for the value, and the rule is then flagged as initialized.
/// </remarks>
/// <param name="name">The name of the HTTP header.</param>
/// <param name="value">The value of the HTTP header.</param>
private void InitializeRule(string name, string value)
{
    string headerName = name == null ? string.Empty : name;
    string headerValue = value == null ? string.Empty : value;

    // Store the normalized strings first, then build the matchers from them.
    _headerNameValue = headerName;
    _headerValueValue = headerValue;

    // NOTE(review): the meaning of the second constructor argument (true) is not visible here.
    _nameTextMatch = new TextMatch(headerName, true);
    _valueTextMatch = new TextMatch(headerValue, true);

    this.IsInitialized = true;
}
/// <summary>
/// Splits <paramref name="input"/> into tokens, classifies each token, and returns the first
/// run of three consecutive classified tokens whose categories are 1, 2, 3 in that order.
/// </summary>
/// <remarks>
/// Categories: 1 = token matches the two/three character pattern (digit or Thai consonant
/// followed by Thai consonants), 2 = token is one to four digits, 3 = token equals an entry
/// returned by <c>GetProvinces()</c>. Tokens matching none of these are dropped.
/// </remarks>
/// <param name="input">Raw text; literal backslash-n sequences, hyphens and a fixed set of
/// punctuation characters are all treated as token separators.</param>
/// <returns>
/// The three token texts joined by a literal backslash followed by 'n', or an empty string when
/// no such run exists.
/// </returns>
public static System.String func(System.String input)
{
    Regex headPattern = new Regex("^[0-9ก-ฮ][ก-ฮ]?[ก-ฮ]$");
    Regex digitsPattern = new Regex("^[0-9]{1,4}$");
    Regex separatorPattern = new Regex("[*'\",_&#^@/.:;+? ]");

    // Normalize every separator to a real newline, then split on it.
    string normalized = input.Replace(@"\n", "\n");
    normalized = separatorPattern.Replace(normalized, "\n");
    normalized = normalized.Replace("-", "\n");

    List<TextMatch> classified = new List<TextMatch>();
    foreach (string token in normalized.Split('\n'))
    {
        if (headPattern.IsMatch(token))
        {
            classified.Add(new TextMatch() { Text = token, Match = 1 });
            continue;
        }

        if (digitsPattern.IsMatch(token))
        {
            classified.Add(new TextMatch() { Text = token, Match = 2 });
            continue;
        }

        if (GetProvinces().Find(x => x == token) != null)
        {
            classified.Add(new TextMatch() { Text = token, Match = 3 });
        }
    }

    // Walk every category-1 token in order and test the two tokens that follow it.
    int searchFrom = 0;
    while (true)
    {
        int headIndex = classified.FindIndex(searchFrom, x => x.Match == 1);
        if (headIndex < 0)
        {
            return "";
        }

        if (classified.Count >= headIndex + 3)
        {
            TextMatch first = classified[headIndex];
            TextMatch second = classified[headIndex + 1];
            TextMatch third = classified[headIndex + 2];

            if (first.Match == 1 && second.Match == 2 && third.Match == 3)
            {
                return string.Format(@"{0}\n{1}\n{2}", first.Text, second.Text, third.Text);
            }
        }

        // Not a full 1-2-3 run here; resume the search just after this head token.
        searchFrom = headIndex + 1;
    }
}
/// <summary>
/// Route-building override of <c>GetSubTextPositions</c>: instead of running the action it
/// returns a result object describing the call (area, controller name, action name and the
/// route values derived from <paramref name="inputForm"/>).
/// </summary>
/// <param name="inputForm">The view model whose values are added to the route values.</param>
/// <returns>The populated call-info result.</returns>
public override System.Web.Mvc.ActionResult GetSubTextPositions(TextMatch.UI.Web.Models.Home.GetSubTextPositionsViewModel inputForm)
{
    T4MVC_System_Web_Mvc_ActionResult routedCall =
        new T4MVC_System_Web_Mvc_ActionResult(Area, Name, ActionNames.GetSubTextPositions);

    // Record the model under the form name so it becomes part of the route values.
    ModelUnbinderHelpers.AddRouteValues(
        routedCall.RouteValueDictionary,
        GetSubTextPositionsViewModel.FormName,
        inputForm);

    // Optional partial-method hook; it is a no-op unless implemented elsewhere.
    GetSubTextPositionsOverride(routedCall, inputForm);

    return routedCall;
}
/// <summary>
/// Initializes the rule from the given value.
/// </summary>
/// <remarks>
/// A null value is stored as an empty string. A <c>TextMatch</c> is created for the stored
/// value and the rule is then flagged as initialized.
/// </remarks>
/// <param name="value">The value used for the URL rule.</param>
public void InitializeRule(string value)
{
    string pattern = value == null ? string.Empty : value;

    _textMatchValue = pattern;

    // NOTE(review): the meaning of the second constructor argument (true) is not visible here.
    _textMatch = new TextMatch(pattern, true);

    this.IsInitialized = true;
}
/// <summary>
/// Partial-method hook for <c>GetSubTextPositions</c>. This is a declaration only; if no
/// implementing part is supplied, calls to it are removed by the compiler.
/// </summary>
/// <param name="callInfo">The call-info result being populated for the action.</param>
/// <param name="inputForm">The view model passed to the action.</param>
partial void GetSubTextPositionsOverride(T4MVC_System_Web_Mvc_ActionResult callInfo, TextMatch.UI.Web.Models.Home.GetSubTextPositionsViewModel inputForm);
/// <summary>
/// Initializes the rule from the given HTTP method.
/// </summary>
/// <remarks>
/// A null method is stored as an empty string. A <c>TextMatch</c> is created for the stored
/// value and the rule is then flagged as initialized.
/// </remarks>
/// <param name="httpMethod">The HTTP method for the rule.</param>
public void InitializeRule(string httpMethod)
{
    string method = httpMethod == null ? string.Empty : httpMethod;

    _textMatchValue = method;

    // NOTE(review): the meaning of the second constructor argument (true) is not visible here.
    _textMatch = new TextMatch(method, true);

    this.IsInitialized = true;
}
/// <summary>
/// Finds approximate matches for the opening words of a Rashi unit inside a daf while
/// tolerating skipped words: either one word skipped on the Rashi side, or up to two words
/// skipped on the gemara side (never a Rashi skip together with a second gemara skip).
/// </summary>
/// <remarks>
/// <para>
/// Phrases of four or more words are compared by word hash with a first-letter fallback and a
/// mismatch allowance of <c>wordCount / 2 - 1</c>; shorter phrases are compared with
/// <c>IsStringMatchup</c>. Every accepted match is scored with the word-level Levenshtein
/// distance plus <c>fullWordValue</c> for each skipped word.
/// </para>
/// <para>
/// At most one match is produced per gemara start word: the first skip combination that
/// matches wins.
/// </para>
/// <para>
/// Changes from the previous revision: (1) when a Rashi word is skipped, the first-letter
/// fallback now uses the word list with that word removed, so it stays aligned with the hash
/// list (previously only the hashes were shifted); (2) the Rashi words and hashes are computed
/// once instead of once per skipped word; (3) start words that already produced a match are
/// skipped up front instead of being fully re-evaluated and then discarded. Item (3) assumes
/// <c>BuildPhraseFromArray</c> and <c>IsStringMatchup</c> have no side effects.
/// </para>
/// </remarks>
/// <param name="curDaf">The daf being searched; its <c>allWords</c> and <c>wordhashes</c> are read.</param>
/// <param name="curRashi">The Rashi unit whose normalized starting text is being located.</param>
/// <param name="startBound">Index of the first gemara word that may start a match.</param>
/// <param name="endBound">Highest gemara word index that <c>iWord + wordCount - 1</c> may reach.</param>
/// <param name="threshold">Threshold forwarded to <c>IsStringMatchup</c> for short phrases.</param>
/// <returns>The matches found; empty when the phrase has fewer than two words.</returns>
private static List<TextMatch> GetAllApproximateMatchesWithWordSkip(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();

    int wordCount = curRashi.cvWordcount;

    // No point to this unless we have at least 2 words
    if (wordCount < 2)
    {
        return allMatches;
    }

    string startText = curRashi.startingTextNormalized;

    // Gemara start words that already produced a match (one match per start word).
    HashSet<int> usedStartwords = new HashSet<int>();

    // The word split and the hashes do not depend on which word is skipped; compute them once
    // and copy per iteration, because the copies are mutated below.
    List<string> rashiwords = Regex.Split(startText.Trim(), @"\s+").ToList();
    List<long> rashiHashes = CalculateHashes(rashiwords);

    // Iterate through all the starting words within the phrase, allowing for one word to be
    // ignored (-1 means the phrase as is).
    for (int iWordToIgnore = -1; iWordToIgnore < wordCount; iWordToIgnore++)
    {
        List<long> cvhashes = new List<long>(rashiHashes);
        List<string> cvwords = new List<string>(rashiwords);
        string alternateStartText;

        if (iWordToIgnore >= 0)
        {
            // Drop the skipped word from BOTH lists so that index i refers to the same word
            // in the hash list and in the first-letter fallback.
            cvhashes.RemoveAt(iWordToIgnore);
            cvwords.RemoveAt(iWordToIgnore);
            alternateStartText = GetStringWithRemovedWord(startText, iWordToIgnore).Trim();
        }
        else
        {
            alternateStartText = startText;
        }

        // Iterate through all possible starting words within the gemara.
        for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++)
        {
            // This start word already has a match; nothing further can be added for it.
            if (usedStartwords.Contains(iWord))
            {
                continue;
            }

            bool matchedHere = false;

            // First gemara word to ignore; -1 means the phrase as is.
            for (int gemaraWordToIgnore = -1; gemaraWordToIgnore < wordCount && !matchedHere; gemaraWordToIgnore++)
            {
                // no point in skipping first word - we might as well just let the item start
                // from the next startword
                if (gemaraWordToIgnore == 0)
                {
                    continue;
                }

                // Second gemara word to ignore (-1 means no second word).
                for (int gemaraWord2ToIgnore = -1; gemaraWord2ToIgnore < wordCount; gemaraWord2ToIgnore++)
                {
                    // if not skipping first, this is not relevant unless it is also -1
                    if (gemaraWordToIgnore == -1 && gemaraWord2ToIgnore != -1)
                    {
                        continue;
                    }

                    // we don't need to do things both directions
                    if (gemaraWord2ToIgnore != -1 && gemaraWord2ToIgnore < gemaraWordToIgnore)
                    {
                        continue;
                    }

                    // if we are skipping a cv word, don't also skip a second word
                    if (iWordToIgnore != -1 && gemaraWord2ToIgnore != -1)
                    {
                        continue;
                    }

                    // if this would bring us to the end, don't do it
                    if (gemaraWord2ToIgnore != -1 && iWord + wordCount >= curDaf.allWords.Count)
                    {
                        continue;
                    }

                    bool fIsMatch = false;

                    if (wordCount >= 4)
                    {
                        int nonMatchAllowance = wordCount / 2 - 1;

                        if (curDaf.wordhashes[iWord] == cvhashes[0])
                        {
                            // see if the rest match up
                            int offset = 0;
                            fIsMatch = true;

                            for (int icvword = 1; icvword < wordCount - 1; icvword++)
                            {
                                // Step over a gemara word that is being ignored.
                                if (icvword == gemaraWordToIgnore || icvword == gemaraWord2ToIgnore)
                                {
                                    offset++;
                                }

                                // A word counts as a mismatch only when both its hash and its
                                // first letter differ.
                                if (curDaf.wordhashes[iWord + icvword + offset] != cvhashes[icvword]
                                    && curDaf.allWords[iWord + icvword + offset][0] != cvwords[icvword][0])
                                {
                                    nonMatchAllowance--;
                                    if (nonMatchAllowance < 0)
                                    {
                                        fIsMatch = false;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        // Short phrase: build the gemara phrase (minus ignored words) and
                        // compare it as a string.
                        double distance;
                        string candidatePhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                        fIsMatch = IsStringMatchup(alternateStartText, candidatePhrase, threshold, out distance);
                    }

                    if (!fIsMatch)
                    {
                        continue;
                    }

                    TextMatch curMatch = new TextMatch();

                    // if gemaraWordToIgnore is -1, then we didn't skip anything in the gemara.
                    // if iWordToIgnore is -1, then we didn't skip anything in the main phrase.
                    // Whether or not we used the hash shortcut, calculate the full distance here.
                    string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                    double dist = ComputeLevenshteinDistanceByWord(alternateStartText, targetPhrase);

                    // add penalty for skipped words
                    if (gemaraWordToIgnore >= 0) dist += fullWordValue;
                    if (gemaraWord2ToIgnore >= 0) dist += fullWordValue;
                    if (iWordToIgnore >= 0) dist += fullWordValue;

                    int normalizedDistance = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.score = normalizedDistance;
                    curMatch.textToMatch = curRashi.startingText;

                    // the "text matched" is the actual text of the gemara, including the word
                    // we skipped.
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                    curMatch.startWord = iWord;
                    curMatch.endWord = iWord + wordCount - 1;

                    // if we skipped the last word or two words, then we should cut them out
                    if (gemaraWordToIgnore == wordCount - 2 && gemaraWord2ToIgnore == wordCount - 1)
                    {
                        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 2);
                        curMatch.endWord -= 2;
                    }
                    else if (gemaraWordToIgnore == wordCount - 1)
                    {
                        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 1);
                        curMatch.endWord--;
                    }

                    allMatches.Add(curMatch);
                    usedStartwords.Add(iWord);

                    // One match per start word: stop trying further skip combinations.
                    matchedHere = true;
                    break;
                }
            }
        }
    }

    return allMatches;
}
/// <summary>
/// Finds matches for the opening words of a Rashi unit inside a daf where one side writes an
/// abbreviation (a word containing a double-quote mark) and the other side spells the words out.
/// </summary>
/// <remarks>
/// <para>
/// For every candidate gemara start word the Rashi words are walked in order. A word containing
/// a quote mark on either side is treated as an abbreviation (rashei teivot) and is matched
/// against the first letters of the following words on the other side. Three layouts are tried:
/// one letter per word, the first two letters taken from the first word, and the first three
/// letters taken from the first word. Words without a quote mark are compared with
/// <c>IsStringMatchup</c>.
/// </para>
/// <para>
/// A candidate is reported only if at least one abbreviation matched: the plain-word branch can
/// clear the match flag but never sets it.
/// </para>
/// <para>
/// Change from the previous revision: after an abbreviation expands over several gemara words
/// the gemara index for the next phrase word can land on <c>allWords.Count</c>. That case is
/// now treated as "no match" instead of indexing past the end of the word list.
/// </para>
/// </remarks>
/// <param name="curDaf">The daf being searched; its <c>allWords</c> list is read.</param>
/// <param name="curRashi">The Rashi unit whose normalized starting text is being located.</param>
/// <param name="startBound">Index of the first gemara word that may start a match.</param>
/// <param name="endBound">Highest gemara word index that <c>start + wordCount - 1</c> may reach.</param>
/// <param name="threshold">Threshold forwarded to <c>IsStringMatchup</c> for plain words.</param>
/// <returns>The matches found; empty when the phrase has no words.</returns>
private static List<TextMatch> GetAllApproximateMatchesWithAbbrev(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();
    string startText = curRashi.startingTextNormalized;
    int wordCount = curRashi.cvWordcount;
    if (wordCount == 0)
    {
        return allMatches;
    }

    // convert string into an array of words
    string[] startTextWords = Regex.Split(startText, @"\s+");

    // go through all possible starting words in the gemara text
    for (int iStartingWordInGemara = startBound; iStartingWordInGemara <= curDaf.allWords.Count - wordCount && iStartingWordInGemara + wordCount - 1 <= endBound; iStartingWordInGemara++)
    {
        bool fIsMatch = false;
        int offsetWithinGemara = 0;   // extra gemara words consumed by expanded abbreviations
        int offsetWithinRashiCV = 0;  // extra Rashi words consumed by expanded abbreviations
        double distance = 0;
        double totaldistance = 0;

        // now we loop according to the number of words in the cv
        for (int iWordWithinPhrase = 0; iWordWithinPhrase + offsetWithinRashiCV < wordCount; iWordWithinPhrase++)
        {
            // Both indexes are taken before this iteration changes either offset.
            int rashiIndex = iWordWithinPhrase + offsetWithinRashiCV;
            int gemaraIndex = iStartingWordInGemara + iWordWithinPhrase + offsetWithinGemara;

            // first check if the cv word has a quotemark
            if (startTextWords[rashiIndex].Contains("\""))
            {
                // get the abbreviation (rashei teivot) without the quote mark
                string cleanRT = startTextWords[rashiIndex].Replace("\"", "");
                int maxlen = cleanRT.Length;

                // let's see if this matches the start of the next few gemara words
                int curpos = gemaraIndex;
                fIsMatch = false;

                // Layout 1: one abbreviation letter per gemara word.
                if (curpos + maxlen <= curDaf.allWords.Count)
                {
                    fIsMatch = true;
                    for (int igemaraword = curpos; igemaraword < curpos + maxlen; igemaraword++)
                    {
                        if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos])
                        {
                            fIsMatch = false;
                            break;
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen gemara words stand for one Rashi word; minus one because the
                        // loop increment accounts for one of them.
                        offsetWithinGemara += maxlen - 1;
                    }
                }

                // Layout 2: the first two letters come from the first gemara word.
                if (curpos + maxlen <= curDaf.allWords.Count + 1 && !fIsMatch && maxlen > 2)
                {
                    fIsMatch = true;
                    if (curDaf.allWords[curpos].Length < 2
                        || curDaf.allWords[curpos][0] != cleanRT[0]
                        || curDaf.allWords[curpos][1] != cleanRT[1])
                    {
                        fIsMatch = false;
                    }
                    else
                    {
                        for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 1; igemaraword++)
                        {
                            if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 1])
                            {
                                fIsMatch = false;
                                break;
                            }
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen - 1 gemara words were consumed; the loop increment covers one.
                        offsetWithinGemara += maxlen - 2;
                    }
                }

                // Layout 3: the first three letters come from the first gemara word.
                if (curpos + maxlen <= curDaf.allWords.Count + 2 && !fIsMatch && maxlen > 3)
                {
                    fIsMatch = true;
                    if (curDaf.allWords[curpos].Length < 3
                        || curDaf.allWords[curpos][0] != cleanRT[0]
                        || curDaf.allWords[curpos][1] != cleanRT[1]
                        || curDaf.allWords[curpos][2] != cleanRT[2])
                    {
                        fIsMatch = false;
                    }
                    else
                    {
                        for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 2; igemaraword++)
                        {
                            if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 2])
                            {
                                fIsMatch = false;
                                break;
                            }
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen - 2 gemara words were consumed; the loop increment covers one.
                        offsetWithinGemara += maxlen - 3;
                    }
                }

                if (!fIsMatch)
                {
                    break;
                }
            }
            else if (gemaraIndex >= curDaf.allWords.Count)
            {
                // Earlier abbreviation expansions used up the rest of the daf, so there is no
                // gemara word left for this Rashi word: not a match.
                fIsMatch = false;
                break;
            }
            else if (curDaf.allWords[gemaraIndex].Contains("\""))
            {
                // The gemara word is the abbreviation (rashei teivot); strip the quote mark.
                string cleanRT = curDaf.allWords[gemaraIndex].Replace("\"", "");
                int maxlen = cleanRT.Length;

                // let's see if this matches the start of the next few Rashi words
                int curpos = rashiIndex;
                fIsMatch = false;

                // Layout 1: one abbreviation letter per Rashi word.
                if (curpos + maxlen <= wordCount)
                {
                    fIsMatch = true;
                    for (int icvword = curpos; icvword < curpos + maxlen; icvword++)
                    {
                        if (startTextWords[icvword][0] != cleanRT[icvword - curpos])
                        {
                            fIsMatch = false;
                            break;
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen Rashi words stand for one gemara word; minus one because the
                        // loop increment accounts for one of them.
                        offsetWithinRashiCV += maxlen - 1;
                    }
                }

                // Layout 2: the first two letters come from the first Rashi word.
                if (curpos + maxlen <= wordCount + 1 && !fIsMatch && maxlen > 2)
                {
                    fIsMatch = true;
                    if (startTextWords[curpos].Length < 2
                        || startTextWords[curpos][0] != cleanRT[0]
                        || startTextWords[curpos][1] != cleanRT[1])
                    {
                        fIsMatch = false;
                    }
                    else
                    {
                        for (int icvword = curpos + 1; icvword < curpos + maxlen - 1; icvword++)
                        {
                            if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 1])
                            {
                                fIsMatch = false;
                                break;
                            }
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen - 1 Rashi words were consumed; the loop increment covers one.
                        offsetWithinRashiCV += maxlen - 2;
                    }
                }

                // Layout 3: the first three letters come from the first Rashi word.
                if (curpos + maxlen <= wordCount + 2 && !fIsMatch && maxlen > 3)
                {
                    fIsMatch = true;
                    if (startTextWords[curpos].Length < 3
                        || startTextWords[curpos][0] != cleanRT[0]
                        || startTextWords[curpos][1] != cleanRT[1]
                        || startTextWords[curpos][2] != cleanRT[2])
                    {
                        fIsMatch = false;
                    }
                    else
                    {
                        for (int icvword = curpos + 1; icvword < curpos + maxlen - 2; icvword++)
                        {
                            if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 2])
                            {
                                fIsMatch = false;
                                break;
                            }
                        }
                    }

                    if (fIsMatch)
                    {
                        // maxlen - 2 Rashi words were consumed; the loop increment covers one.
                        offsetWithinRashiCV += maxlen - 3;
                    }
                }

                if (!fIsMatch)
                {
                    break;
                }
            }
            else
            {
                // great, this is a basic compare.
                bool fMatch = IsStringMatchup(startTextWords[rashiIndex], curDaf.allWords[gemaraIndex], threshold, out distance);
                totaldistance += distance;

                // if these words don't match, break and this isn't a match.
                if (!fMatch)
                {
                    fIsMatch = false;
                    break;
                }
            }
        }

        // How many fewer gemara words than Rashi words the match covers (negative when the
        // gemara side is longer).
        int gemaraDifferential = offsetWithinRashiCV - offsetWithinGemara;

        // If it is, add it in.
        if (fIsMatch)
        {
            TextMatch curMatch = new TextMatch();
            curMatch.textToMatch = curRashi.startingText;
            curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iStartingWordInGemara, wordCount - gemaraDifferential);
            curMatch.startWord = iStartingWordInGemara;

            // NOTE(review): the sibling matchers set endWord to start + count - 1 (inclusive);
            // this one has no "- 1". Left unchanged because callers are not visible here;
            // confirm which convention is intended.
            curMatch.endWord = iStartingWordInGemara + wordCount - gemaraDifferential;

            // calculate the score, adding in the penalty for abbreviation
            totaldistance += abbreviationPenalty;
            int normalizedDistance = (int)((totaldistance + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
            curMatch.score = normalizedDistance;
            allMatches.Add(curMatch);
        }
    }

    return allMatches;
}
/// <summary>
/// Finds approximate matches for the opening words of a Rashi unit inside a daf, comparing the
/// phrase against every window of <c>wordCount</c> consecutive gemara words within the bounds.
/// </summary>
/// <remarks>
/// <para>
/// Phrases of four or more words are compared by word hash: the first word's hash must be equal
/// and at most <c>ceil(wordCount * threshold * 1.35)</c> of the remaining hashes may differ.
/// Shorter phrases are compared with <c>IsStringMatchup</c>. Every accepted window is scored
/// with the word-level Levenshtein distance, normalized by the phrase length.
/// </para>
/// <para>
/// The Rashi hashes and the mismatch allowance do not depend on the window, so they are computed
/// once rather than for every candidate start word. The hashes are still computed lazily, on the
/// first window, so nothing is evaluated when the loop never runs.
/// </para>
/// </remarks>
/// <param name="curDaf">The daf being searched; its <c>allWords</c> and <c>wordhashes</c> are read.</param>
/// <param name="curRashi">The Rashi unit whose normalized starting text is being located.</param>
/// <param name="startBound">Index of the first gemara word that may start a match.</param>
/// <param name="endBound">Highest gemara word index that <c>iWord + wordCount - 1</c> may reach.</param>
/// <param name="threshold">Match threshold: forwarded to <c>IsStringMatchup</c> and used to derive the hash mismatch allowance.</param>
/// <returns>The matches found; empty when the phrase has no words.</returns>
private static List<TextMatch> GetAllApproximateMatches(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();
    string startText = curRashi.startingTextNormalized;
    int wordCount = curRashi.cvWordcount;
    if (wordCount == 0)
    {
        return allMatches;
    }

    // if phrase is 4 or more words, use the hashes
    bool useHashes = wordCount >= 4;

    // Loop-invariant values for the hash comparison.
    List<long> cvhashes = null;
    int allowedMismatches = (int)Math.Ceiling(wordCount * threshold * 1.35);

    // Okay, start going through all the permutations..
    for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++)
    {
        bool fIsMatch = false;

        if (useHashes)
        {
            if (cvhashes == null)
            {
                // get the hashes for the starting text (once, on the first window)
                cvhashes = CalculateHashes(Regex.Split(startText.Trim(), @"\s+").ToList());
            }

            if (curDaf.wordhashes[iWord] == cvhashes[0])
            {
                // see if the rest match up
                int mismatches = 0;
                for (int icvword = 1; icvword < wordCount; icvword++)
                {
                    if (curDaf.wordhashes[iWord + icvword] != cvhashes[icvword])
                    {
                        mismatches++;
                    }
                }

                // now we need to decide if we can let it go
                fIsMatch = mismatches <= allowedMismatches;
            }
        }
        else
        {
            // build the phrase and check if it is a match
            double distance;
            string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
            fIsMatch = IsStringMatchup(startText, targetPhrase, threshold, out distance);
        }

        if (!fIsMatch)
        {
            continue;
        }

        // It is a match: record it.
        TextMatch curMatch = new TextMatch();
        curMatch.textToMatch = curRashi.startingText;
        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
        curMatch.startWord = iWord;
        curMatch.endWord = iWord + wordCount - 1;

        // calculate the score - how distant is it
        double dist = ComputeLevenshteinDistanceByWord(startText, curMatch.textMatched);
        int normalizedDistance = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
        curMatch.score = normalizedDistance;
        allMatches.Add(curMatch);
    }

    return allMatches;
}