Exemplo n.º 1
0
        /// <summary>
        /// Reads every "*.txt" file under <c>baseDir + "Talmud\\"</c> and <c>baseDir + "Rashi\\"</c>,
        /// builds one <c>GemaraMasechet</c> per gemara file (split into dapim and words), and
        /// attaches each parsed Rashi unit to the daf named by the most recent "Daf " heading.
        /// </summary>
        /// <param name="baseDir">
        /// Directory prefix; it is concatenated directly with "Talmud\\", "Rashi\\" and "szdata.bin",
        /// so it must already end with a path separator.
        /// </param>
        /// <returns>All masechtot that were loaded, in dictionary enumeration order.</returns>
        /// <remarks>
        /// Files that are too short to carry their title lines, and Rashi files whose masechet has
        /// no corresponding gemara file, are reported on standard error and skipped instead of
        /// aborting the whole run with an exception.
        /// </remarks>
        private static List<GemaraMasechet> GetAllMasechtotWithRashi(string baseDir)
        {
            string baseGemaraDir = baseDir + "Talmud\\";
            string baseRashiDir = baseDir + "Rashi\\";

            Dictionary<string, GemaraMasechet> allMasechtot = new Dictionary<string, GemaraMasechet>();

            // Pass 1: collect the gemara text, daf by daf.
            string[] allFiles = Directory.GetFiles(baseGemaraDir, "*.txt");
            for (int iFile = 0; iFile < allFiles.Length; iFile++) {
                string file = allFiles[iFile];
                Console.Write("\rProcessing gemara file " + (iFile + 1) + " out of " + allFiles.Length + "       ");
                string[] allLines = File.ReadAllLines(file);

                // The first two lines hold the English and Hebrew masechet names.
                if (allLines.Length < 2) {
                    Console.Error.WriteLine("\nSkipping gemara file without title lines: " + file);
                    continue;
                }

                GemaraMasechet gm = new GemaraMasechet();
                gm.masechetNameEng = allLines[0];
                gm.masechetNameHeb = allLines[1];

                // Walk every line; text is only collected once the first "Daf " heading was seen.
                GemaraDaf gd = new GemaraDaf();
                foreach (string curLine in allLines) {
                    if (curLine.Trim() == "") continue;

                    if (curLine.StartsWith("Daf ")) {
                        // A new heading closes the previous daf (if it collected any text).
                        if (gd.dafLocation != null && gd.gemaraText.Trim() != "") {
                            gm.allDapim.Add(gd.dafLocation, gd);
                        }
                        gd = new GemaraDaf();
                        gd.dafLocation = curLine;
                        gd.gemaraText = "";

                        // Debug runs stop at the first heading containing "10a".
                        if (fdebug) {
                            if (gd.dafLocation.Contains("10a"))
                                break;
                        }

                        continue;
                    }
                    // Still in the preamble before the first daf.
                    if (gd.dafLocation == null) continue;

                    string cleanGemaraText = CleanText(curLine);

                    // Parenthesised spans are excluded from the word list.
                    MatchCollection mcWordToIgnore = Regex.Matches(cleanGemaraText, @"\([^)]+\)");

                    int startWord = gd.allWords.Count;
                    MatchCollection mcWords = Regex.Matches(cleanGemaraText, @"\S+");
                    foreach (Match m in mcWords) {
                        // Keep only Hebrew letters, spaces and double quotes.
                        string curVal = Regex.Replace(m.Value, "[^א-ת \"]", "");
                        if (curVal.Trim() == "") continue;

                        // Skip tokens lying completely inside a parenthesised span.
                        bool fIgnore = false;
                        foreach (Match mToIgnore in mcWordToIgnore) {
                            if (m.Index >= mToIgnore.Index && (m.Index + m.Length) <= (mToIgnore.Index + mToIgnore.Length)) {
                                fIgnore = true;
                                break;
                            }
                        }
                        if (fIgnore) continue;

                        // Map the word index to its character offset within the accumulated daf text.
                        gd.iWordToOrigChar.Add(gd.allWords.Count, gd.gemaraText.Length + m.Index);
                        gd.allWords.Add(curVal);
                    }
                    int endWord = gd.allWords.Count - 1;

                    // One (first word, last word) pair per text line; Rashi line numbers index into this list.
                    gd.lineStartingWordPointers.Add(new Tuple<int, int>(startWord, endWord));
                    gd.gemaraText += cleanGemaraText + " ";
                }
                // Flush the last daf of the file.
                if (gd.dafLocation != null && gd.gemaraText.Trim() != "") {
                    gm.allDapim.Add(gd.dafLocation, gd);
                }

                allMasechtot.Add(gm.masechetNameEng, gm);
            }

            // Pass 2: Rashi.
            allFiles = Directory.GetFiles(baseRashiDir, "*.txt");
            for (int iFile = 0; iFile < allFiles.Length; iFile++) {
                string file = allFiles[iFile];
                Console.Write("\rProcessing rashi file " + (iFile + 1) + " out of " + allFiles.Length + "       ");
                string[] allLines = File.ReadAllLines(file);
                if (allLines.Length == 0) {
                    Console.Error.WriteLine("\nSkipping empty rashi file: " + file);
                    continue;
                }

                // The first line is "Rashi on <masechet>"; the remainder is the gemara dictionary key.
                string masechet = allLines[0].Replace("Rashi on ", "");
                GemaraMasechet targetMasechet;
                if (!allMasechtot.TryGetValue(masechet, out targetMasechet)) {
                    // Previously this surfaced as a KeyNotFoundException on the first Rashi unit.
                    Console.Error.WriteLine("\nSkipping rashi file with no matching gemara (" + masechet + "): " + file);
                    continue;
                }

                string curDaf = ""; int curLineN = 0;
                for (int iLine = 0; iLine < allLines.Length; iLine++) {
                    string curLine = allLines[iLine];
                    if (curLine.Trim() == "") continue;
                    if (curLine.StartsWith("Line ")) {
                        // Line numbers in the file are 1-based; store them 0-based.
                        curLineN = int.Parse(curLine.Replace("Line ", "")) - 1; continue;
                    }

                    if (curLine.StartsWith("Daf ")) {
                        curDaf = curLine; continue;
                    }
                    if (curDaf.Trim() == "") continue;

                    // Drop parenthesised spans (including leading גמ' / מתני' declarations in parens).
                    curLine = Regex.Replace(curLine, @"\s*\([^)]+\)\s*", " ");
                    // A single trailing word after the last colon is dropped.
                    curLine = Regex.Replace(curLine, @":\s*\S+$", ":");

                    // Units are separated by ": ".
                    string[] allUnits = Regex.Split(curLine, ": ");
                    for (int iUnit = 0; iUnit < allUnits.Length; iUnit++) {
                        string unitStr = allUnits[iUnit];
                        if (unitStr.Trim() == "") continue;

                        // If there is no dash separator, treat the first ". " as the separator instead.
                        if (!Regex.IsMatch(unitStr, " [-‒–—] ")) {
                            unitStr = Regex.Replace(unitStr, @"^([^.]+)\. ", "$1 - ");
                        }

                        RashiUnit curUnit = new RashiUnit();
                        Match m = Regex.Match(unitStr, "(.*?) [-‒–—] (.*)");
                        curUnit.startingText = m.Groups[1].Value.Trim();
                        curUnit.fullText = unitStr;
                        curUnit.valueText = m.Groups[2].Value;
                        curUnit.lineN = curLineN;

                        // Strip a trailing "וכו'" style suffix and a leading gemara/mishna marker.
                        string normalizedCV = Regex.Replace(curUnit.startingText, " ו" + "?" + "כו" + "'?" + "$", "").Trim();
                        normalizedCV = Regex.Replace(normalizedCV, "^(גמ|גמרא|מתני|מתניתין|משנה)'? ", "").Trim();

                        // For a ה"ג unit keep only up to three words following the first word.
                        if (curUnit.startingText.StartsWith("ה\"ג")) {
                            normalizedCV = Regex.Match(normalizedCV, "[^ ]+ ([^ ]+( [^ ]+)?( [^ ]+)?)").Groups[1].Value;
                        }

                        // Keep only Hebrew letters, spaces and double quotes.
                        normalizedCV = Regex.Replace(normalizedCV, "[^א-ת \"]", "").Trim();

                        curUnit.startingTextNormalized = normalizedCV;
                        curUnit.cvWordcount = CountWords(normalizedCV);

                        // Discard units that cannot be placed: unknown daf, missing halves, or a line
                        // number beyond the lines recorded for that daf.
                        if (!targetMasechet.allDapim.ContainsKey(curDaf)) continue;
                        if (curUnit.startingText == "" || curUnit.valueText == "") continue;
                        if (curUnit.lineN >= targetMasechet.allDapim[curDaf].lineStartingWordPointers.Count) continue;
                        targetMasechet.allDapim[curDaf].allRashi.Add(curUnit);
                    }
                }
            }

            //////// Serialize //////////////////
            if (fSerializeData) {
                // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data; consider
                // migrating this cache to another serializer.
                // 'using' guarantees the file handle is released even if Serialize throws.
                using (FileStream fs = new FileStream(baseDir + "szdata.bin", FileMode.Create)) {
                    BinaryFormatter bf = new BinaryFormatter();
                    bf.Serialize(fs, allMasechtot.Values.ToList());
                }
            }
            /////////////////////////////////////

            Console.WriteLine("\n");
            return allMasechtot.Values.ToList();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Finds approximate occurrences of a Rashi unit's normalized starting text within
        /// <paramref name="curDaf"/>, allowing either one word of the starting text or up to two
        /// words of the gemara window to be skipped.
        /// </summary>
        /// <param name="curDaf">Daf whose <c>allWords</c> / <c>wordhashes</c> are searched.</param>
        /// <param name="curRashi">Rashi unit supplying <c>startingTextNormalized</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word index to try as a starting word.</param>
        /// <param name="endBound">Last gemara word index the unskipped window may reach.</param>
        /// <param name="threshold">Threshold forwarded to <c>IsStringMatchup</c> for phrases shorter than four words.</param>
        /// <returns>
        /// At most one match per gemara starting word; empty when the starting text has fewer than two words.
        /// </returns>
        private static List<TextMatch> GetAllApproximateMatchesWithWordSkip(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();
            // Starting words that already produced a match; a set keeps the membership test cheap.
            HashSet<int> usedStartwords = new HashSet<int>();

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;

            // No point to this unless we have at least 2 words
            if (wordCount < 2) return allMatches;

            // The split and the hashes do not depend on which word is skipped, so compute them once
            // and copy them per iteration (each iteration may remove one entry).
            List<string> allCvWords = Regex.Split(startText.Trim(), @"\s+").ToList();
            List<long> allCvHashes = CalculateHashes(allCvWords);

            // Iterate through all the starting words within the phrase, allowing for one word to be ignored
            // (-1 means the phrase is used as is).
            for (int iWordToIgnore = -1; iWordToIgnore < wordCount; iWordToIgnore++) {

                List<string> rashiwords = new List<string>(allCvWords);
                List<long> cvhashes = new List<long>(allCvHashes);

                string alternateStartText = "";
                if (iWordToIgnore >= 0) {
                    // Remove the skipped word from BOTH lists. Previously only the hashes were
                    // shortened, so cvhashes[i] and rashiwords[i] referred to different words for
                    // every index at or after the skipped one.
                    cvhashes.RemoveAt(iWordToIgnore);
                    rashiwords.RemoveAt(iWordToIgnore);
                    alternateStartText = GetStringWithRemovedWord(startText, iWordToIgnore).Trim();
                }
                else {
                    alternateStartText = startText;
                }

                // Iterate through all possible starting words within the gemara, allowing for the word afterward to be ignored
                for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++) {

                    // A starting word yields at most one match, so once it is used nothing below can add another.
                    if (usedStartwords.Contains(iWord)) continue;

                    // Start from -1 (which means the phrase as is)
                    for (int gemaraWordToIgnore = -1; gemaraWordToIgnore < wordCount; gemaraWordToIgnore++) {

                        // no point in skipping first word - we might as well just let the item start from the next startword
                        if (gemaraWordToIgnore == 0) continue;

                        // and, choose a second word to ignore (-1 means no second word)
                        for (int gemaraWord2ToIgnore = -1; gemaraWord2ToIgnore < wordCount; gemaraWord2ToIgnore++) {

                            // a second skipped word only makes sense together with a first one
                            if (gemaraWordToIgnore == -1 && gemaraWord2ToIgnore != -1)
                                continue;

                            // we don't need to do things both directions
                            if (gemaraWord2ToIgnore != -1 && gemaraWord2ToIgnore < gemaraWordToIgnore)
                                continue;

                            // if we are skipping a cv word, don't also skip a second word
                            if (iWordToIgnore != -1 && gemaraWord2ToIgnore != -1) {
                                continue;
                            }

                            // if this would bring us to the end, don't do it
                            if (gemaraWord2ToIgnore != -1 && iWord + wordCount >= curDaf.allWords.Count)
                                continue;

                            bool fIsMatch = false;
                            double distance = 0;

                            if (wordCount >= 4) {
                                // Long phrases: compare per-word hashes, tolerating a limited number of misses.
                                int nonMatchAllowance = wordCount/2 - 1;

                                long initialhash = cvhashes[0];
                                if (curDaf.wordhashes[iWord] == initialhash) {
                                    // see if the rest match up
                                    int offset = 0;
                                    fIsMatch = true;
                                    for (int icvword = 1; icvword < wordCount - 1; icvword++) {
                                        // Step over a skipped gemara word.
                                        if (icvword == gemaraWordToIgnore || icvword == gemaraWord2ToIgnore) {
                                            offset++;
                                        }

                                        // A word counts as a miss only if both the hash and the first letter differ.
                                        if (curDaf.wordhashes[iWord + icvword + offset] != cvhashes[icvword] &&
                                            curDaf.allWords[iWord + icvword + offset][0] != rashiwords[icvword][0]) {

                                            nonMatchAllowance--;

                                            if (nonMatchAllowance < 0) {
                                                fIsMatch = false;
                                                break;
                                            }
                                        }
                                    }
                                }
                            }
                            else {
                                // Short phrases: build the candidate phrase and compare the strings directly.
                                string candidatePhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                                fIsMatch = IsStringMatchup(alternateStartText, candidatePhrase, threshold, out distance);
                            }

                            // If it is, add it in.
                            if (fIsMatch) {

                                if (usedStartwords.Contains(iWord)) continue;
                                TextMatch curMatch = new TextMatch();

                                // if gemaraWordToIgnore is -1, then we didn't skip anything in the gemara.
                                // if iWordToIgnore is -1, then we didn't skip anything in the main phrase

                                // whether or not we used the hash shortcut, calculate the full distance here.
                                string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount, gemaraWordToIgnore, gemaraWord2ToIgnore);
                                double dist = ComputeLevenshteinDistanceByWord(alternateStartText, targetPhrase);

                                // add penalty for skipped words
                                if (gemaraWordToIgnore >= 0)
                                    dist += fullWordValue;
                                if (gemaraWord2ToIgnore >= 0)
                                    dist += fullWordValue;
                                if (iWordToIgnore >= 0)
                                    dist += fullWordValue;

                                int normalizedDistance = (int) ((dist + smoothingFactor)/(startText.Length + smoothingFactor)*normalizingFactor);
                                curMatch.score = normalizedDistance;
                                curMatch.textToMatch = curRashi.startingText;

                                // the "text matched" is the actual text of the gemara, including the word we skipped.
                                curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                                curMatch.startWord = iWord;
                                curMatch.endWord = iWord + wordCount - 1;

                                // if we skipped the last word or two words, then we should cut them out of here
                                if (gemaraWordToIgnore == wordCount - 2 && gemaraWord2ToIgnore == wordCount -1) {
                                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 2);
                                    curMatch.endWord-=2;
                                }
                                else if (gemaraWordToIgnore == wordCount - 1) {
                                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount - 1);
                                    curMatch.endWord--;
                                }

                                allMatches.Add(curMatch);

                                usedStartwords.Add(iWord);
                                break;
                            }
                        }
                    }
                }
            }
            return allMatches;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Finds approximate occurrences of a Rashi unit's normalized starting text within
        /// <paramref name="curDaf"/>, comparing windows of exactly <c>cvWordcount</c> gemara words
        /// (no skipped words, no abbreviation handling).
        /// </summary>
        /// <param name="curDaf">Daf whose <c>allWords</c> / <c>wordhashes</c> are searched.</param>
        /// <param name="curRashi">Rashi unit supplying <c>startingTextNormalized</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word index to try as a starting word.</param>
        /// <param name="endBound">Last gemara word index a window may reach.</param>
        /// <param name="threshold">
        /// Passed to <c>IsStringMatchup</c> for short phrases; for phrases of four or more words it
        /// scales the number of per-word hash mismatches that are tolerated.
        /// </param>
        /// <returns>One match per accepted starting word; empty when the word count is zero.</returns>
        private static List<TextMatch> GetAllApproximateMatches(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;
            if (wordCount == 0) return allMatches;

            // Phrases of 4+ words are compared through per-word hashes. The hashes of the starting
            // text are the same for every gemara position, so compute them once up front instead
            // of re-splitting and re-hashing inside the loop.
            bool fUseHashes = wordCount >= 4;
            List<long> cvhashes = null;
            if (fUseHashes) {
                cvhashes = CalculateHashes(Regex.Split(startText.Trim(), @"\s+").ToList());
            }

            // Okay, start going through all the candidate windows..
            double distance = 0;
            for (int iWord = startBound; iWord <= curDaf.allWords.Count - wordCount && iWord + wordCount - 1 <= endBound; iWord++) {

                bool fIsMatch = false;
                if (fUseHashes) {
                    // The first word must match by hash before the rest is examined.
                    if (curDaf.wordhashes[iWord] == cvhashes[0]) {

                        // count how many of the remaining words differ
                        int mismatches = 0;
                        for (int icvword = 1; icvword < wordCount; icvword++) {
                            if (curDaf.wordhashes[iWord + icvword] != cvhashes[icvword]) {
                                mismatches++;
                            }
                        }

                        // now we need to decide if we can let it go
                        int allowedMismatches = (int)Math.Ceiling(wordCount * threshold * 1.35);
                        if (mismatches <= allowedMismatches) {
                            distance = mismatches;
                            fIsMatch = true;
                        }
                    }
                }
                else {
                    // Short phrase: build the window text and compare the strings directly.
                    string targetPhrase = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                    fIsMatch = IsStringMatchup(startText, targetPhrase, threshold, out distance);
                }

                // If it is, add it in.
                if (fIsMatch) {
                    TextMatch curMatch = new TextMatch();
                    curMatch.textToMatch = curRashi.startingText;
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iWord, wordCount);
                    curMatch.startWord = iWord;
                    curMatch.endWord = iWord + wordCount - 1;

                    // calculate the score - how distant is it
                    double dist = ComputeLevenshteinDistanceByWord(startText, curMatch.textMatched);
                    int normalizedDistance = (int)((dist + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.score = normalizedDistance;

                    allMatches.Add(curMatch);
                }
            }

            return allMatches;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Finds occurrences of a Rashi unit's normalized starting text within <paramref name="curDaf"/>
        /// where one side writes an abbreviation (a word containing a double quote) and the other
        /// side spells the words out. A candidate is only reported when at least one abbreviation
        /// was resolved; non-abbreviated words are compared with <c>IsStringMatchup</c>.
        /// </summary>
        /// <param name="curDaf">Daf whose <c>allWords</c> are searched.</param>
        /// <param name="curRashi">Rashi unit supplying <c>startingTextNormalized</c> and <c>cvWordcount</c>.</param>
        /// <param name="startBound">First gemara word index to try as a starting word.</param>
        /// <param name="endBound">Last gemara word index the unexpanded window may reach.</param>
        /// <param name="threshold">Threshold forwarded to <c>IsStringMatchup</c> for word-by-word comparison.</param>
        /// <returns>
        /// One match per accepted starting word, with <c>endWord</c> set to the index of the last
        /// gemara word consumed (inclusive); empty when the word count is zero.
        /// </returns>
        private static List<TextMatch> GetAllApproximateMatchesWithAbbrev(GemaraDaf curDaf, RashiUnit curRashi, int startBound, int endBound, double threshold)
        {
            List<TextMatch> allMatches = new List<TextMatch>();

            string startText = curRashi.startingTextNormalized;
            int wordCount = curRashi.cvWordcount;
            if (wordCount == 0) return allMatches;

            // convert string into an array of words
            string[] startTextWords = Regex.Split(startText, @"\s+");

            // go through all possible starting words in the gemara text
            for (int iStartingWordInGemara = startBound; iStartingWordInGemara <= curDaf.allWords.Count - wordCount && iStartingWordInGemara + wordCount - 1 <= endBound; iStartingWordInGemara++) {

                bool fIsMatch = false;
                // Extra gemara words consumed because a cv abbreviation expanded into several gemara words.
                int offsetWithinGemara = 0;
                // Extra cv words consumed because a gemara abbreviation expanded into several cv words.
                int offsetWithinRashiCV = 0;
                double distance = 0;
                double totaldistance = 0;

                // now we loop according to the number of words in the cv
                for (int iWordWithinPhrase = 0; iWordWithinPhrase + offsetWithinRashiCV < wordCount; iWordWithinPhrase++) {

                    // Positions of the current word on each side, fixed for this iteration.
                    int cvIndex = iWordWithinPhrase + offsetWithinRashiCV;
                    int gemaraIndex = iStartingWordInGemara + offsetWithinGemara + iWordWithinPhrase;

                    // first check if the cv word has a quotemark
                    if (startTextWords[cvIndex].Contains("\"")) {

                        // get our ראשי תיבות word without the quote mark
                        string cleanRT = startTextWords[cvIndex].Replace("\"", "");
                        int maxlen = cleanRT.Length;

                        // let's see if this matches the start of the next few words
                        int curpos = gemaraIndex;
                        fIsMatch = false;

                        // Variant 1: each abbreviation letter is the first letter of one gemara word.
                        if (curpos + maxlen <= curDaf.allWords.Count) {
                            fIsMatch = true;
                            for (int igemaraword = curpos; igemaraword < curpos + maxlen; igemaraword++) {
                                if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos]) {
                                    fIsMatch = false;
                                    break;
                                }
                            }
                            if (fIsMatch) {
                                // maxlen gemara words stand for 1 cv word; minus one, because the loop increments by one.
                                offsetWithinGemara += maxlen - 1;
                            }
                        }

                        // Variant 2: the first two letters both come from the first gemara word.
                        if (curpos + maxlen <= curDaf.allWords.Count + 1) {

                            if (!fIsMatch && maxlen > 2) {

                                fIsMatch = true;
                                if (curDaf.allWords[curpos].Length < 2 || curDaf.allWords[curpos][0] != cleanRT[0] || curDaf.allWords[curpos][1] != cleanRT[1]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 1; igemaraword++) {
                                        if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 1]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 1 gemara words stand for 1 cv word.
                                    offsetWithinGemara += maxlen - 2;
                                }
                            }
                        }

                        // Variant 3: the first three letters all come from the first gemara word.
                        if (curpos + maxlen <= curDaf.allWords.Count + 2) {

                            if (!fIsMatch && maxlen > 3) {

                                fIsMatch = true;
                                if (curDaf.allWords[curpos].Length < 3 || curDaf.allWords[curpos][0] != cleanRT[0] || curDaf.allWords[curpos][1] != cleanRT[1] ||
                                    curDaf.allWords[curpos][2] != cleanRT[2]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int igemaraword = curpos + 1; igemaraword < curpos + maxlen - 2; igemaraword++) {
                                        if (curDaf.allWords[igemaraword][0] != cleanRT[igemaraword - curpos + 2]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 2 gemara words stand for 1 cv word.
                                    offsetWithinGemara += maxlen - 3;
                                }
                            }
                        }

                        if (!fIsMatch) break;
                    }
                    else if (gemaraIndex >= curDaf.allWords.Count) {
                        // An earlier abbreviation expansion consumed extra gemara words, so the daf
                        // has run out before the phrase did. Treat that as "no match" rather than
                        // indexing past the end of allWords.
                        fIsMatch = false;
                        break;
                    }
                    else if (curDaf.allWords[gemaraIndex].Contains("\"")) {

                        // get our ראשי תיבות word without the quote mark
                        string cleanRT = curDaf.allWords[gemaraIndex].Replace("\"", "");
                        int maxlen = cleanRT.Length;

                        // let's see if this matches the start of the next few cv words
                        int curpos = cvIndex;
                        fIsMatch = false;

                        // Variant 1: each abbreviation letter is the first letter of one cv word.
                        if (curpos + maxlen <= wordCount) {
                            fIsMatch = true;
                            for (int icvword = curpos; icvword < curpos + maxlen; icvword++) {
                                if (startTextWords[icvword][0] != cleanRT[icvword - curpos]) {
                                    fIsMatch = false;
                                    break;
                                }
                            }
                            if (fIsMatch) {
                                // maxlen cv words stand for 1 gemara word; minus one, because the loop increments by one.
                                offsetWithinRashiCV += maxlen - 1;
                            }
                        }

                        // Variant 2: the first two letters both come from the first cv word.
                        if (curpos + maxlen <= wordCount + 1) {
                            if (!fIsMatch && maxlen > 2) {

                                fIsMatch = true;
                                if (startTextWords[curpos].Length < 2 || startTextWords[curpos][0] != cleanRT[0] || startTextWords[curpos][1] != cleanRT[1]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int icvword = curpos + 1; icvword < curpos + maxlen - 1; icvword++) {
                                        if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 1]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 1 cv words stand for 1 gemara word.
                                    offsetWithinRashiCV += maxlen - 2;
                                }
                            }
                        }

                        // Variant 3: the first three letters all come from the first cv word.
                        if (curpos + maxlen <= wordCount + 2) {

                            if (!fIsMatch && maxlen > 3) {

                                fIsMatch = true;
                                if (startTextWords[curpos].Length < 3 || startTextWords[curpos][0] != cleanRT[0] || startTextWords[curpos][1] != cleanRT[1] ||
                                    startTextWords[curpos][2] != cleanRT[2]) {
                                    fIsMatch = false;
                                }
                                else {
                                    for (int icvword = curpos + 1; icvword < curpos + maxlen - 2; icvword++) {
                                        if (startTextWords[icvword][0] != cleanRT[icvword - curpos + 2]) {
                                            fIsMatch = false;
                                            break;
                                        }
                                    }
                                }
                                if (fIsMatch) {
                                    // maxlen - 2 cv words stand for 1 gemara word.
                                    offsetWithinRashiCV += maxlen - 3;
                                }
                            }
                        }

                        if (!fIsMatch) break;
                    }
                    else {
                        // great, this is a basic compare.
                        bool fMatch = IsStringMatchup(startTextWords[cvIndex], curDaf.allWords[gemaraIndex], threshold, out distance);
                        totaldistance += distance;
                        // if these words don't match, break and this isn't a match.
                        // (A successful basic compare never sets fIsMatch itself, so a phrase
                        // without any resolved abbreviation is not reported by this method.)
                        if (!fMatch) {
                            fIsMatch = false; break;
                        }
                    }
                }

                // How many fewer gemara words than cv words the match spans (negative when the gemara side is longer).
                int gemaraDifferential = offsetWithinRashiCV - offsetWithinGemara;

                // If it is, add it in.
                if (fIsMatch) {
                    int matchedGemaraWords = wordCount - gemaraDifferential;

                    TextMatch curMatch = new TextMatch();
                    curMatch.textToMatch = curRashi.startingText;
                    curMatch.textMatched = BuildPhraseFromArray(curDaf.allWords, iStartingWordInGemara, matchedGemaraWords);
                    curMatch.startWord = iStartingWordInGemara;
                    // endWord is inclusive, as in the sibling matchers (start + count - 1); it was
                    // previously set one word past the text actually placed in textMatched.
                    curMatch.endWord = iStartingWordInGemara + matchedGemaraWords - 1;

                    // calculate the score, adding in the penalty for abbreviation
                    totaldistance += abbreviationPenalty;
                    int normalizedDistance = (int)((totaldistance + smoothingFactor) / (startText.Length + smoothingFactor) * normalizingFactor);
                    curMatch.score = normalizedDistance;

                    allMatches.Add(curMatch);
                }
            }

            return allMatches;
        }
Exemplo n.º 5
0
 /// <summary>
 /// Counts the Rashi units of a daf that have not been placed in the gemara text yet,
 /// i.e. those whose <c>startWord</c> is still -1.
 /// </summary>
 /// <param name="curDaf">Daf whose <c>allRashi</c> collection is inspected.</param>
 /// <returns>The number of Rashi units with <c>startWord == -1</c>.</returns>
 private static int CountUnmatchedUpRashi(GemaraDaf curDaf)
 {
     int unmatchedCount = 0;
     foreach (RashiUnit unit in curDaf.allRashi) {
         // Anything other than -1 means the unit already has a location.
         if (unit.startWord != -1) {
             continue;
         }
         unmatchedCount += 1;
     }
     return unmatchedCount;
 }