/// <summary>
/// Loads every gemara text file under <c>baseDir + "Talmud\"</c>, splits it into
/// dapim and words, then attaches the Rashi units found under
/// <c>baseDir + "Rashi\"</c> to the matching daf. When <c>fSerializeData</c> is set
/// the result is also written to <c>baseDir + "szdata.bin"</c>.
/// </summary>
/// <param name="baseDir">
/// Base directory. It is concatenated directly with the sub-directory names, so it
/// must already end with a directory separator.
/// </param>
/// <returns>One <c>GemaraMasechet</c> per gemara file, in dictionary enumeration order.</returns>
/// <remarks>
/// A gemara file is read as: line 0 = English name, line 1 = Hebrew name, then lines
/// starting with "Daf " open a new daf and every other non-blank line is daf text.
/// A Rashi file is read as: line 0 = "Rashi on &lt;English name&gt;", lines starting with
/// "Line " carry a 1-based line number, lines starting with "Daf " select the daf.
/// A Rashi file whose masechet has no gemara file raises <c>KeyNotFoundException</c>
/// once the first non-empty Rashi unit is reached.
/// </remarks>
private static List<GemaraMasechet> GetAllMasechtotWithRashi(string baseDir)
{
    string baseGemaraDir = baseDir + "Talmud\\";
    string baseRashiDir = baseDir + "Rashi\\";
    Dictionary<string, GemaraMasechet> allMasechtot = new Dictionary<string, GemaraMasechet>();

    // ---- Pass 1: gemara text, grouped by daf ----
    string[] allFiles = Directory.GetFiles(baseGemaraDir, "*.txt");
    for (int iFile = 0; iFile < allFiles.Length; iFile++)
    {
        string file = allFiles[iFile];
        Console.Write("\rProcessing gemara file " + (iFile + 1) + " out of " + allFiles.Length + " ");

        string[] allLines = File.ReadAllLines(file);

        GemaraMasechet gm = new GemaraMasechet();
        gm.masechetNameEng = allLines[0];
        gm.masechetNameHeb = allLines[1];

        // The two name lines are also visited by the loop below, but they are
        // skipped because no daf heading has been seen yet at that point.
        GemaraDaf gd = new GemaraDaf();
        foreach (string curLine in allLines)
        {
            if (curLine.Trim() == "") continue;

            if (curLine.StartsWith("Daf "))
            {
                // A new heading closes the previous daf; store it if it has any text.
                if (gd.dafLocation != null && gd.gemaraText.Trim() != "")
                {
                    gm.allDapim.Add(gd.dafLocation, gd);
                }

                gd = new GemaraDaf();
                gd.dafLocation = curLine;
                gd.gemaraText = "";

                // Debug mode: stop reading this file at the first heading containing "10a".
                if (fdebug)
                {
                    if (gd.dafLocation.Contains("10a")) break;
                }
                continue;
            }

            // Text before the first daf heading is ignored.
            if (gd.dafLocation == null) continue;

            string cleanGemaraText = CleanText(curLine);

            // Parenthesized spans: words that lie entirely inside one are not indexed.
            MatchCollection mcWordToIgnore = Regex.Matches(cleanGemaraText, @"\([^)]+\)");
            int startWord = gd.allWords.Count;

            MatchCollection mcWords = Regex.Matches(cleanGemaraText, @"\S+");
            foreach (Match m in mcWords)
            {
                // Keep only Hebrew letters, spaces and double quotes.
                string curVal = Regex.Replace(m.Value, "[^א-ת \"]", "");
                if (curVal.Trim() == "") continue;

                bool fIgnore = false;
                foreach (Match mToIgnore in mcWordToIgnore)
                {
                    if (m.Index >= mToIgnore.Index && (m.Index + m.Length) <= (mToIgnore.Index + mToIgnore.Length))
                    {
                        fIgnore = true;
                        break; // one enclosing span is enough
                    }
                }
                if (fIgnore) continue;

                // Map the word index to its character offset within the daf text so far.
                gd.iWordToOrigChar.Add(gd.allWords.Count, gd.gemaraText.Length + m.Index);
                gd.allWords.Add(curVal);
            }

            // Record the (first, last) word indices of this source line.
            // For a line that contributed no words, endWord is startWord - 1.
            int endWord = gd.allWords.Count - 1;
            gd.lineStartingWordPointers.Add(new Tuple<int, int>(startWord, endWord));
            gd.gemaraText += cleanGemaraText + " ";
        }

        // Store the daf that was still open when the file ended.
        if (gd.dafLocation != null && gd.gemaraText.Trim() != "")
        {
            gm.allDapim.Add(gd.dafLocation, gd);
        }
        allMasechtot.Add(gm.masechetNameEng, gm);
    }

    // ---- Pass 2: Rashi units, attached to the dapim loaded above ----
    allFiles = Directory.GetFiles(baseRashiDir, "*.txt");
    for (int iFile = 0; iFile < allFiles.Length; iFile++)
    {
        string file = allFiles[iFile];
        Console.Write("\rProcessing rashi file " + (iFile + 1) + " out of " + allFiles.Length + " ");

        string[] allLines = File.ReadAllLines(file);

        // The first line names the masechet; it must equal the gemara file's English name.
        string masechet = allLines[0].Replace("Rashi on ", "");

        string curDaf = "";
        int curLineN = 0;
        for (int iLine = 0; iLine < allLines.Length; iLine++)
        {
            string curLine = allLines[iLine];
            if (curLine.Trim() == "") continue;

            if (curLine.StartsWith("Line "))
            {
                // Stored zero-based.
                curLineN = int.Parse(curLine.Replace("Line ", "")) - 1;
                continue;
            }

            if (curLine.StartsWith("Daf "))
            {
                curDaf = curLine;
                continue;
            }

            // Nothing can be attached before the first daf heading.
            if (curDaf.Trim() == "") continue;

            // Replace every parenthesized span (and surrounding whitespace) with one space.
            curLine = Regex.Replace(curLine, @"\s*\([^)]+\)\s*", " ");
            // Drop a single trailing word that follows the last colon.
            curLine = Regex.Replace(curLine, @":\s*\S+$", ":");

            // Units are separated by ": ".
            string[] allUnits = Regex.Split(curLine, ": ");
            for (int iUnit = 0; iUnit < allUnits.Length; iUnit++)
            {
                string unitStr = allUnits[iUnit];
                if (unitStr.Trim() == "") continue;

                // When no dash separates heading from comment, turn the first ". " into " - ".
                if (!Regex.IsMatch(unitStr, " [-‒–—] "))
                {
                    unitStr = Regex.Replace(unitStr, @"^([^.]+)\. ", "$1 - ");
                }

                RashiUnit curUnit = new RashiUnit();
                Match m = Regex.Match(unitStr, "(.*?) [-‒–—] (.*)");
                curUnit.startingText = m.Groups[1].Value.Trim();
                curUnit.fullText = unitStr;
                curUnit.valueText = m.Groups[2].Value;
                curUnit.lineN = curLineN;

                // Strip the trailing suffix matched by the pattern below
                // (presumably an "etc." marker - confirm).
                string normalizedCV = Regex.Replace(curUnit.startingText, " ו" + "?" + "כו" + "'?" + "$", "").Trim();
                // Strip a leading section marker (one of the alternatives in the pattern).
                normalizedCV = Regex.Replace(normalizedCV, "^(גמ|גמרא|מתני|מתניתין|משנה)'? ", "").Trim();

                // When the heading starts with the marker tested here, drop that first
                // token and keep at most the three words that follow it.
                if (curUnit.startingText.StartsWith("ה\"ג"))
                {
                    normalizedCV = Regex.Match(normalizedCV, "[^ ]+ ([^ ]+( [^ ]+)?( [^ ]+)?)").Groups[1].Value;
                }

                // Keep only Hebrew letters, spaces and double quotes.
                normalizedCV = Regex.Replace(normalizedCV, "[^א-ת \"]", "").Trim();
                curUnit.startingTextNormalized = normalizedCV;
                curUnit.cvWordcount = CountWords(normalizedCV);

                // Resolve the target daf once instead of re-walking both dictionaries
                // for every check. The indexer throws if the masechet was never loaded.
                GemaraMasechet targetMasechet = allMasechtot[masechet];
                if (!targetMasechet.allDapim.ContainsKey(curDaf)) continue;
                if (curUnit.startingText == "" || curUnit.valueText == "") continue;

                GemaraDaf targetDaf = targetMasechet.allDapim[curDaf];
                // Skip units whose line number lies beyond the lines recorded for the daf.
                if (curUnit.lineN >= targetDaf.lineStartingWordPointers.Count) continue;

                targetDaf.allRashi.Add(curUnit);
            }
        }
    }

    List<GemaraMasechet> result = allMasechtot.Values.ToList();

    //////// Serialize //////////////////
    if (fSerializeData)
    {
        // NOTE(review): BinaryFormatter is obsolete and unsafe on untrusted input; it is
        // kept here so that existing readers of szdata.bin keep working.
        // 'using' guarantees the stream is closed even if Serialize throws.
        using (FileStream fs = new FileStream(baseDir + "szdata.bin", FileMode.Create))
        {
            BinaryFormatter bf = new BinaryFormatter();
            bf.Serialize(fs, result);
        }
    }
    /////////////////////////////////////

    Console.WriteLine("\n");
    return result;
}
/// <summary>
/// Finds approximate occurrences of a Rashi heading inside a daf while tolerating
/// skipped words: at most one word dropped from the heading, or up to two words
/// dropped from the gemara window (never a heading word together with two gemara words).
/// </summary>
/// <param name="curDaf">Daf to search; its <c>allWords</c> and <c>wordhashes</c> are read.</param>
/// <param name="curRashi">Rashi unit; its normalized heading and word count are read.</param>
/// <param name="startBound">First gemara word index that may start a match.</param>
/// <param name="endBound">Last gemara word index the unskipped window may reach.</param>
/// <param name="threshold">Passed to <c>IsStringMatchup</c> for headings shorter than four words.</param>
/// <returns>
/// At most one match per gemara start word. Empty when the heading has fewer than two words.
/// </returns>
private static List<TextMatch> GetAllApproximateMatchesWithWordSkip(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();
    string startText = curRashi.startingTextNormalized;
    int wordCount = curRashi.cvWordcount;

    // No point to this unless we have at least 2 words.
    if (wordCount < 2) return allMatches;

    // Start positions that already produced a match (set: O(1) membership).
    HashSet<int> usedStartwords = new HashSet<int>();

    // Headings of four or more words are compared by word hash / first letter;
    // shorter ones by string distance. The split and the hashes do not depend on
    // which word is skipped, so they are computed once.
    bool useHashes = wordCount >= 4;
    List<string> allCvWords = useHashes ? Regex.Split(startText.Trim(), @"\s+").ToList() : null;
    List<long> allCvHashes = useHashes ? CalculateHashes(allCvWords) : null;

    // iWordToIgnore == -1 means the heading is used as is.
    for (int iWordToIgnore = -1; iWordToIgnore < wordCount; iWordToIgnore++)
    {
        List<string> rashiwords = useHashes ? new List<string>(allCvWords) : null;
        List<long> cvhashes = useHashes ? new List<long>(allCvHashes) : null;
        string alternateStartText = startText;

        if (iWordToIgnore >= 0)
        {
            if (useHashes)
            {
                // Remove the skipped word from BOTH lists. Previously only the hashes
                // were shortened, so the first-letter fallback below compared against
                // the wrong heading word for every position after the skipped one.
                rashiwords.RemoveAt(iWordToIgnore);
                cvhashes.RemoveAt(iWordToIgnore);
            }
            alternateStartText = GetStringWithRemovedWord(startText, iWordToIgnore).Trim();
        }

        for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++)
        {
            // Only the first match per start word is kept, so there is nothing left
            // to do for a start word that already has one.
            if (usedStartwords.Contains(iWord)) continue;

            bool matchedHere = false;

            // gemaraWordToIgnore == -1 means the gemara window is used as is.
            for (int gemaraWordToIgnore = -1; gemaraWordToIgnore < wordCount && !matchedHere; gemaraWordToIgnore++)
            {
                // Skipping the first gemara word equals starting one word later.
                if (gemaraWordToIgnore == 0) continue;

                // Optional second gemara word to skip (-1 means none).
                for (int gemaraWord2ToIgnore = -1; gemaraWord2ToIgnore < wordCount; gemaraWord2ToIgnore++)
                {
                    // A second skip only makes sense together with a first one.
                    if (gemaraWordToIgnore == -1 && gemaraWord2ToIgnore != -1) continue;
                    // Each unordered pair is tried once.
                    if (gemaraWord2ToIgnore != -1 && gemaraWord2ToIgnore < gemaraWordToIgnore) continue;
                    // Never combine a skipped heading word with a second skipped gemara word.
                    if (iWordToIgnore != -1 && gemaraWord2ToIgnore != -1) continue;
                    // Two skips need one extra gemara word after the window.
                    if (gemaraWord2ToIgnore != -1 && iWord + wordCount >= curDaf.allWords.Count) continue;

                    bool fIsMatch = false;
                    if (useHashes)
                    {
                        if (curDaf.wordhashes[iWord] == cvhashes[0])
                        {
                            int nonMatchAllowance = wordCount / 2 - 1;
                            int offset = 0;
                            fIsMatch = true;

                            // Positions 1 .. wordCount - 2 are compared.
                            for (int icvword = 1; icvword < wordCount - 1; icvword++)
                            {
                                if (icvword == gemaraWordToIgnore || icvword == gemaraWord2ToIgnore)
                                {
                                    offset++;
                                }

                                // A word counts as a mismatch only when both its hash
                                // and its first letter differ.
                                int gemaraIndex = iWord + icvword + offset;
                                if (curDaf.wordhashes[gemaraIndex] != cvhashes[icvword]
                                    && curDaf.allWords[gemaraIndex][0] != rashiwords[icvword][0])
                                {
                                    nonMatchAllowance--;
                                    if (nonMatchAllowance < 0)
                                    {
                                        fIsMatch = false;
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        double distance;
                        string candidatePhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                        fIsMatch = IsStringMatchup(alternateStartText, candidatePhrase, threshold, out distance);
                    }

                    if (!fIsMatch) continue;

                    // Whichever shortcut accepted the candidate, score it with the
                    // full word-level distance plus a penalty per skipped word.
                    string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                    double dist = ComputeLevenshteinDistanceByWord(alternateStartText, targetPhrase);
                    if (gemaraWordToIgnore >= 0) dist += fullWordValue;
                    if (gemaraWord2ToIgnore >= 0) dist += fullWordValue;
                    if (iWordToIgnore >= 0) dist += fullWordValue;

                    TextMatch curMatch = new TextMatch();
                    curMatch.score = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.textToMatch = curRashi.startingText;

                    // The matched text is the real gemara text, skipped words included.
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                    curMatch.startWord = iWord;
                    curMatch.endWord = iWord + wordCount - 1;

                    // Skipped words at the very end of the window are not part of the match.
                    if (gemaraWordToIgnore == wordCount - 2 && gemaraWord2ToIgnore == wordCount - 1)
                    {
                        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 2);
                        curMatch.endWord -= 2;
                    }
                    else if (gemaraWordToIgnore == wordCount - 1)
                    {
                        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 1);
                        curMatch.endWord--;
                    }

                    allMatches.Add(curMatch);
                    usedStartwords.Add(iWord);
                    matchedHere = true;
                    break;
                }
            }
        }
    }

    return allMatches;
}
/// <summary>
/// Finds every window of the daf, as long as the Rashi heading, that approximately
/// matches the heading word for word (no skipped words, no abbreviation handling).
/// </summary>
/// <param name="curDaf">Daf to search; its <c>allWords</c> and <c>wordhashes</c> are read.</param>
/// <param name="curRashi">Rashi unit; its normalized heading and word count are read.</param>
/// <param name="startBound">First gemara word index that may start a match.</param>
/// <param name="endBound">Last gemara word index a match may reach.</param>
/// <param name="threshold">
/// Passed to <c>IsStringMatchup</c> for short headings; for headings of four or more
/// words it scales the number of tolerated hash mismatches.
/// </param>
/// <returns>All matching windows in ascending start order; empty when the heading has no words.</returns>
private static List<TextMatch> GetAllApproximateMatches(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();
    string startText = curRashi.startingTextNormalized;
    int wordCount = curRashi.cvWordcount;
    if (wordCount == 0) return allMatches;

    // Headings of four or more words are compared through the word hashes.
    // The heading hashes and the mismatch budget are the same for every candidate
    // position, so they are computed once instead of once per iteration.
    bool useHashes = wordCount >= 4;
    List<long> cvhashes = useHashes
        ? CalculateHashes(Regex.Split(startText.Trim(), @"\s+").ToList())
        : null;
    int allowedMismatches = (int)Math.Ceiling(wordCount * threshold * 1.35);

    for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++)
    {
        bool fIsMatch = false;

        if (useHashes)
        {
            // The first word must match by hash; the remaining words may differ
            // up to the mismatch budget.
            if (curDaf.wordhashes[iWord] == cvhashes[0])
            {
                int mismatches = 0;
                for (int icvword = 1; icvword < wordCount; icvword++)
                {
                    if (curDaf.wordhashes[iWord + icvword] != cvhashes[icvword])
                    {
                        mismatches++;
                    }
                }
                fIsMatch = mismatches <= allowedMismatches;
            }
        }
        else
        {
            double distance;
            string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
            fIsMatch = IsStringMatchup(startText, targetPhrase, threshold, out distance);
        }

        if (!fIsMatch) continue;

        TextMatch curMatch = new TextMatch();
        curMatch.textToMatch = curRashi.startingText;
        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
        curMatch.startWord = iWord;
        curMatch.endWord = iWord + wordCount - 1;

        // Score: word-level distance, smoothed and normalized by heading length.
        double dist = ComputeLevenshteinDistanceByWord(startText, curMatch.textMatched);
        curMatch.score = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);

        allMatches.Add(curMatch);
    }

    return allMatches;
}
/// <summary>
/// Finds occurrences of a Rashi heading inside a daf where one side writes a group of
/// words as a quoted abbreviation (a word containing a double quote) and the other
/// side spells the words out.
/// </summary>
/// <param name="curDaf">Daf to search; its <c>allWords</c> list is read.</param>
/// <param name="curRashi">Rashi unit; its normalized heading and word count are read.</param>
/// <param name="startBound">First gemara word index that may start a match.</param>
/// <param name="endBound">Last gemara word index the unexpanded window may reach.</param>
/// <param name="threshold">Passed to <c>IsStringMatchup</c> for ordinary word pairs.</param>
/// <returns>
/// Matches in ascending start order. A candidate is reported only when the last
/// abbreviation handled in it expanded successfully and no ordinary word pair failed,
/// so headings without any abbreviation on either side never match here.
/// </returns>
private static List<TextMatch> GetAllApproximateMatchesWithAbbrev(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
{
    List<TextMatch> allMatches = new List<TextMatch>();
    string startText = curRashi.startingTextNormalized;
    int wordCount = curRashi.cvWordcount;
    if (wordCount == 0) return allMatches;

    string[] startTextWords = Regex.Split(startText, @"\s+");

    for (int iStartingWordInGemara = startBound;
         iStartingWordInGemara <= curDaf.allWords.Count - wordCount && iStartingWordInGemara + wordCount - 1 <= endBound;
         iStartingWordInGemara++)
    {
        bool fIsMatch = false;
        // Extra gemara words consumed by heading abbreviations, and extra heading
        // words consumed by gemara abbreviations.
        int offsetWithinGemara = 0;
        int offsetWithinRashiCV = 0;
        double distance = 0;
        double totaldistance = 0;

        for (int iWordWithinPhrase = 0; iWordWithinPhrase + offsetWithinRashiCV < wordCount; iWordWithinPhrase++)
        {
            int cvIndex = iWordWithinPhrase + offsetWithinRashiCV;
            int gemaraIndex = iStartingWordInGemara + iWordWithinPhrase + offsetWithinGemara;
            string cvWord = startTextWords[cvIndex];
            int consumed;

            if (cvWord.Contains("\""))
            {
                // Heading abbreviation: expand it over the following gemara words.
                fIsMatch = TryExpandAbbreviation(cvWord.Replace("\"", ""), curDaf.allWords, gemaraIndex, curDaf.allWords.Count, out consumed);
                if (!fIsMatch) break;

                // 'consumed' words were condensed into one; the loop increment
                // accounts for the one.
                offsetWithinGemara += consumed - 1;
                continue;
            }

            // Earlier expansions may have moved us past the end of the daf. Reading
            // allWords there used to throw; it now simply means "no match".
            if (gemaraIndex >= curDaf.allWords.Count)
            {
                fIsMatch = false;
                break;
            }

            string gemaraWord = curDaf.allWords[gemaraIndex];
            if (gemaraWord.Contains("\""))
            {
                // Gemara abbreviation: expand it over the following heading words.
                fIsMatch = TryExpandAbbreviation(gemaraWord.Replace("\"", ""), startTextWords, cvIndex, wordCount, out consumed);
                if (!fIsMatch) break;

                offsetWithinRashiCV += consumed - 1;
            }
            else
            {
                // Ordinary word against ordinary word. A success leaves fIsMatch
                // untouched on purpose; only a failure clears it.
                bool fMatch = IsStringMatchup(cvWord, gemaraWord, threshold, out distance);
                totaldistance += distance;
                if (!fMatch)
                {
                    fIsMatch = false;
                    break;
                }
            }
        }

        if (!fIsMatch) continue;

        // How many fewer gemara words than heading words the match covers.
        int gemaraDifferential = offsetWithinRashiCV - offsetWithinGemara;

        TextMatch curMatch = new TextMatch();
        curMatch.textToMatch = curRashi.startingText;
        curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iStartingWordInGemara, wordCount - gemaraDifferential);
        curMatch.startWord = iStartingWordInGemara;
        // NOTE(review): this is one past the last matched word, whereas the sibling
        // matchers store an inclusive endWord (start + count - 1). Left unchanged
        // because the callers are not visible here - confirm which one is intended.
        curMatch.endWord = iStartingWordInGemara + wordCount - gemaraDifferential;

        // Score: accumulated word distances plus the fixed abbreviation penalty.
        totaldistance += abbreviationPenalty;
        curMatch.score = (int)((totaldistance + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);

        allMatches.Add(curMatch);
    }

    return allMatches;
}

/// <summary>
/// Checks whether the letters of an abbreviation spell the beginnings of consecutive
/// words, letting the first word account for one, two or three leading letters and
/// every following word for exactly one letter. The variants are tried in that order.
/// </summary>
/// <param name="letters">Abbreviation with its quote marks already removed.</param>
/// <param name="words">Word sequence to expand over.</param>
/// <param name="pos">Index in <paramref name="words"/> where the expansion starts.</param>
/// <param name="wordLimit">Exclusive upper bound for the word indices that may be used.</param>
/// <param name="wordsConsumed">Number of words covered by the expansion; 0 when there is none.</param>
/// <returns><c>true</c> when one of the three variants fits.</returns>
private static bool TryExpandAbbreviation(string letters, IList<string> words, int pos, int wordLimit, out int wordsConsumed)
{
    wordsConsumed = 0;
    int letterCount = letters.Length;

    for (int lettersInFirstWord = 1; lettersInFirstWord <= 3; lettersInFirstWord++)
    {
        // Folding k > 1 letters into the first word is only tried when at least
        // one letter remains for a following word.
        if (lettersInFirstWord > 1 && letterCount <= lettersInFirstWord) continue;

        int needed = letterCount - lettersInFirstWord + 1;
        if (pos + needed > wordLimit) continue;

        bool fits = true;
        if (needed > 0)
        {
            string first = words[pos];
            if (first.Length < lettersInFirstWord)
            {
                fits = false;
            }
            else
            {
                for (int c = 0; c < lettersInFirstWord; c++)
                {
                    if (first[c] != letters[c])
                    {
                        fits = false;
                        break;
                    }
                }
            }

            // Every further word supplies exactly its first letter.
            for (int w = 1; fits && w < needed; w++)
            {
                string next = words[pos + w];
                if (next.Length == 0 || next[0] != letters[lettersInFirstWord - 1 + w])
                {
                    fits = false;
                }
            }
        }

        if (fits)
        {
            // An empty abbreviation fits trivially and covers zero words.
            wordsConsumed = needed;
            return true;
        }
    }

    return false;
}
/// <summary>
/// Counts the Rashi units of a daf that still have no location within the gemara
/// text, i.e. whose <c>startWord</c> is -1.
/// </summary>
/// <param name="curDaf">Daf whose <c>allRashi</c> collection is examined.</param>
/// <returns>The number of Rashi units with <c>startWord == -1</c>.</returns>
private static int CountUnmatchedUpRashi(GemaraDaf curDaf)
{
    return curDaf.allRashi.Count(unit => unit.startWord == -1);
}