예제 #1
0
 /// <summary>
 /// Runs the base pre-consumption hook, logs the start of the run, and then
 /// clears every piece of state this stage carries between tokens.
 /// </summary>
 protected override void BeforeConsumption()
 {
     base.BeforeConsumption();
     Log(LogLevel.MajorInfo, "Started");

     // Window flags: the run begins on the first window and has not yet
     // reached the last one.
     isLastWindow  = false;
     isFirstWindow = true;

     // Look-behind context carried from previously handled input.
     // (prevConsonant is deliberately not reset here; its reset was
     // already disabled in this version of the method.)
     prevElement = null;
     prevVowel   = null;
     prev        = null;
     lastTag     = null;

     // Per-token scratch values.
     newElement = null;
     t          = null;
 }
예제 #2
0
 /// <summary>
 /// Runs the base pre-consumption hook, announces the start on the console,
 /// and then clears every piece of state this stage carries between tokens.
 /// </summary>
 protected override void BeforeConsumption()
 {
     base.BeforeConsumption();
     Console.WriteLine("parser started");

     // Window flags: the run begins on the first window and has not yet
     // reached the last one.
     isLastWindow  = false;
     isFirstWindow = true;

     // Look-behind context carried from previously handled input.
     prevElement   = null;
     prevConsonant = null;
     prevVowel     = null;
     prev          = null;
     lastTag       = null;

     // Per-token scratch values.
     newElement = null;
     t          = null;
 }
        /// <summary>
        /// Called once the input is exhausted: flushes the token that is still
        /// being accumulated in <c>sb</c> (if any), then signals that this stage
        /// has finished producing.
        /// </summary>
        /// <remarks>
        /// Only the Neutral and Letter states flush anything; in any other state
        /// the buffered text is not emitted.
        /// </remarks>
        protected override void AfterConsumption()
        {
            base.AfterConsumption();
            switch (state)
            {
            case TokenState.Neutral:
                // Pending non-letter text is only worth emitting when non-empty.
                if (sb.Length > 0)
                {
                    // Emit the very instance that gets logged instead of
                    // allocating a second, identical Token just for Emit.
                    Token t = new Token(sb.ToString());
                    this.Emit(t);
                    Console.WriteLine("tokenizer: Ate neutral token " + t.Value);
                }
                break;

            case TokenState.Letter:
                // A letter (plus any modifiers gathered in sb) was still open
                // when the input ended; close it out.
                LetterToken lt = new LetterToken(letter, sb.ToString());
                this.Emit(lt);
                Console.WriteLine("tokenizer: Ate letter " + lt.Value);
                break;
            }
            _DoneProducing();
            Console.WriteLine("tokenizer finished");
        }
        /// <summary>
        /// Flushes the token still being accumulated when the input runs out,
        /// logs completion, runs the base hook and finally signals that this
        /// stage is done producing.
        /// </summary>
        protected override void AfterConsumption()
        {
            TokenState finalState = state;

            if (finalState == TokenState.Neutral && sb.Length > 0)
            {
                // Leftover non-letter text: log first, then emit.
                Token pending = new Token(sb.ToString());
                Log("Producing neutral token " + pending.Value);
                this.Emit(pending);
            }
            else if (finalState == TokenState.Letter)
            {
                // A letter and its collected modifiers were still open.
                LetterToken pendingLetter = new LetterToken(letter, sb.ToString());
                Log("Producing letter " + pendingLetter.Value);
                this.Emit(pendingLetter);
            }

            Log(LogLevel.MajorInfo, "Finished");
            base.AfterConsumption();
            _DoneProducing();
        }
        /// <summary>
        /// Takes one character off <paramref name="InQueue"/> and advances the
        /// tokenizer state machine (Letter / Tag / Neutral), emitting a token
        /// whenever one is completed.
        /// </summary>
        /// <param name="InQueue">Queue of input characters; exactly one is dequeued per call.</param>
        protected override void Consume(Queue <char> InQueue)
        {
            char current = InQueue.Dequeue();
            _ItemConsumed(current);

            // --- Letter state: keep collecting modifiers, or close the letter.
            if (state == TokenState.Letter)
            {
                if (HebrewChar.IsModifier(current))
                {
                    sb.Append(current);
                    return;
                }

                LetterToken finishedLetter = new LetterToken(letter, sb.ToString());
                this.Emit(finishedLetter);
                Console.WriteLine("tokenizer: Ate letter " + finishedLetter.Value);
                sb.Length = 0;
                state     = TokenState.Neutral;
                // The current character has not been handled yet; it is
                // processed by the Neutral logic further down.
            }

            // --- Tag state: everything up to the closing '/' is the tag text.
            if (state == TokenState.Tag)
            {
                if (current != '/')
                {
                    sb.Append(current);
                    return;
                }

                state = TokenState.Neutral;
                TagToken finishedTag = new TagToken(sb.ToString());
                this.Emit(finishedTag);
                Console.WriteLine("tokenizer: Ate tag " + finishedTag.Type);
                sb.Length = 0;
                return;
            }

            // Any state other than Neutral is left untouched from here on.
            if (state != TokenState.Neutral)
            {
                return;
            }

            // --- Neutral state.
            bool opensTag = (current == '/');
            if (HebrewChar.IsLetter(current) || opensTag)
            {
                // A letter or a tag opener ends the pending neutral run.
                if (sb.Length > 0)
                {
                    Token pendingText = new Token(sb.ToString());
                    this.Emit(pendingText);
                    Console.WriteLine("tokenizer: Ate neutral token " + pendingText.Value);
                }

                if (opensTag)
                {
                    state = TokenState.Tag;
                }
                else
                {
                    letter = current;
                    state  = TokenState.Letter;
                }
                sb.Length = 0;
                return;
            }

            // Cantillation marks and punctuation become single-character
            // tokens right away; text already buffered in sb is not flushed
            // first.
            if (HebrewChar.IsCantillation(current))
            {
                CantillationToken mark = new CantillationToken(current.ToString());
                this.Emit(mark);
                Console.WriteLine("tokenizer: Ate cantillation mark " + mark.Value);
                return;
            }

            if (HebrewChar.IsPunctuation(current))
            {
                PunctuationToken punctuation = new PunctuationToken(current.ToString());
                this.Emit(punctuation);
                Console.WriteLine("tokenizer: Ate punctuation " + punctuation.Value);
                return;
            }

            // Anything else accumulates as neutral text.
            sb.Append(current);
        }
예제 #6
0
        /// <summary>
        /// Converts tokens from <paramref name="InQueue"/> into elements
        /// (<c>WordTag</c>, <c>Cantillation</c>, <c>Separator</c>, <c>Consonant</c>,
        /// <c>Vowel</c>) and hands each one to <c>AddElement</c>.
        /// </summary>
        /// <remarks>
        /// The whole queue is snapshotted so that the letter being processed can
        /// look ahead at the following letters, but only the head token is
        /// dequeued up front. Additional tokens are dequeued only when a
        /// following vowel-less letter is absorbed into the current vowel (see
        /// the <c>nextIsUnvoicedEhevi</c> handling).
        /// </remarks>
        /// <param name="InQueue">Queue of pending tokens; locked while it is read or dequeued.</param>
        protected override void Consume(Queue <Token> InQueue)
        {
            Token[] tokens;

            // Snapshot every pending token for look-ahead, then remove just the
            // head (tokens[0]) from the queue.
            lock (InQueue)
            {
                tokens = InQueue.ToArray();
                InQueue.Dequeue();
            }
            int tokensToConsume = 1;

            // NOTE(review): tokensToConsume is fixed at 1 above, so the first
            // branch only runs when windowSize < 1; otherwise isLastWindow is
            // derived from IsRunning on every call. Confirm this is intended.
            if (tokensToConsume > windowSize)
            {
                tokensToConsume -= windowSize;
            }
            else
            {
                isLastWindow = !this.IsRunning;
            }
            // The loop header has no increment: every branch below advances i itself.
            for (int i = 0; i < tokensToConsume;)
            {
                newElement = null;
                t          = tokens[i];
                _ItemConsumed(t);
                if (!(t is LetterToken))
                {
                    // Non-letter tokens map one-to-one onto an element.
                    if (t is TagToken)
                    {
                        // A tag also clears the previous-letter context.
                        newElement = new WordTag(((TagToken)t).Type);
                        prev       = null;
                        i++;
                    }
                    else if (t is CantillationToken)
                    {
                        // Cantillation leaves prev untouched.
                        newElement = new Cantillation((t as CantillationToken).Value[0]);
                        i++;
                    }
                    else
                    {
                        // Every other token becomes a separator and clears the
                        // previous-letter context.
                        newElement = new Separator(t.Value);
                        prev       = null;
                        i++;
                    }
                }
                else
                {
                    // Look-ahead: the next letter and the one after it, skipping
                    // only cantillation tokens in between.
                    LetterToken next      = null;
                    LetterToken further   = null;
                    int         nextIndex = -1 /*, furtherIndex = -1*/;
                    int         j;

                    /*j=i-1;
                     * while ((j>0)&&!(tokens[j] is LetterToken)) {
                     * if (!(tokens[j] is CantillationToken))
                     *  break;
                     * j--;
                     * }
                     * if (j>=0) {
                     * prev=tokens[j] as LetterToken;
                     * prevIndex=j;
                     * }*/
                    j = i + 1;
                    // NOTE(review): the bound is tokens.Length - 1, so the scan
                    // never steps past the last index; whatever token j lands on
                    // is then filtered by the `as LetterToken` cast below (null
                    // when it is not a letter).
                    while ((j < tokens.Length - 1) && !(tokens[j] is LetterToken))
                    {
                        if (!(tokens[j] is CantillationToken))
                        {
                            break;
                        }
                        j++;
                    }
                    if (j < tokens.Length)
                    {
                        // nextIndex is recorded even when the cast yields null.
                        next      = tokens[j] as LetterToken;
                        nextIndex = j;
                    }
                    // Same scan again, starting after `next`, to find `further`.
                    j++;
                    while ((j < tokens.Length - 1) && !(tokens[j] is LetterToken))
                    {
                        if (!(tokens[j] is CantillationToken))
                        {
                            break;
                        }
                        j++;
                    }
                    if (j < tokens.Length)
                    {
                        further = tokens[j] as LetterToken;
                        //                        furtherIndex = j;
                    }
                    // Word end: no following letter was found, or this is the
                    // last token of the last window.
                    bool curIsWordEnd = (isLastWindow && (i == tokens.Length - 1)) ||
                                        (next == null);
                    // Word start: no previous letter, or first token of the first
                    // window. Only used by the shuruk case of 'ו' below.
                    bool curIsWordStart = (isFirstWindow && (i == 0)) || (prev == null);
                    // Look for a consonant
                    LetterToken l = (LetterToken)t;
                    switch (l.Letter)
                    {
                    case 'א':
                        newElement = new Consonant(Consonants.Aleph);
                        break;

                    case 'ב':
                        // Dagesh selects Bet over Vet.
                        if (l.HasDagesh)
                        {
                            newElement = new Consonant(Consonants.Bet);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Vet);
                        }
                        break;

                    case 'ג':
                        // Apostrophe selects the Jimmel variant.
                        if (l.HasApostrophe)
                        {
                            newElement = new Consonant(Consonants.Jimmel);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Gimmel);
                        }
                        break;

                    case 'ד':
                        newElement = new Consonant(Consonants.Dalet);
                        break;

                    case 'ה':
                        newElement = new Consonant(Consonants.He);
                        break;

                    case 'ו':
                        // Vav may be a consonant or a vowel carrier.
                        // Consonant when it has dagesh plus a vowel, or any vowel
                        // other than \u05B9 (holam).
                        if ((l.HasDagesh && l.HasAnyVowels) || l.HasAnyVowelsExcept('\u05B9'))
                        {
                            newElement = new Consonant(Consonants.Vav);
                        }
                        // Also a consonant when the next letter is a vav that
                        // itself carries holam or shuruk.
                        else if ((next != null) && (next.Letter == 'ו') && (next.HasAnyModifier('\u05B9', HebrewChar.Shuruk)))
                        {
                            newElement = new Consonant(Consonants.Vav);
                        }
                        else
                        {
                            // v is not declared in this method; it is assigned
                            // here and again for consonants further down.
                            v = l.FirstVowel;
                            switch (v)
                            {
                            case '\u05B9':
                                newElement = new Vowel(Vowels.HolamMale);
                                break;

                            case '\0':
                                if (l.HasShuruk)
                                {
                                    // At a word start an Aleph consonant is added
                                    // first, then the shuruk vowel follows it.
                                    if (curIsWordStart)
                                    {
                                        AddElement(newElement = new Consonant(Consonants.Aleph));
                                        //												Log.Parser.WriteLine("Added consonant "+newElement.Latin+" (sonority "+((Consonant)newElement).Sonority+")");
                                    }
                                    newElement = new Vowel(Vowels.Shuruk);
                                }
                                else
                                {
                                    newElement = new Consonant(Consonants.Vav);
                                }
                                break;
                            }
                        }
                        // Under a tag whose origin bits equal Foreign, a
                        // consonantal vav is replaced by W.
                        if ((newElement is Consonant) && (lastTag != null))
                        {
                            if ((lastTag.Tag & TagTypes.Origin) == TagTypes.Foreign)
                            {
                                newElement = new Consonant(Consonants.W);
                            }
                        }
                        break;

                    case 'ז':
                        if (l.HasApostrophe)
                        {
                            newElement = new Consonant(Consonants.Zhayin);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Zayin);
                        }
                        break;

                    case 'ח':
                        if (l.HasApostrophe)
                        {
                            newElement = new Consonant(Consonants.Khaf);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Het);
                        }
                        break;

                    case 'י':
                        newElement = new Consonant(Consonants.Yud);
                        break;

                    case 'ט':
                        newElement = new Consonant(Consonants.Tet);
                        break;

                    // Regular and final forms share one mapping here and below.
                    case 'כ':
                    case 'ך':
                        if (l.HasDagesh)
                        {
                            newElement = new Consonant(Consonants.Kaf);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Khaf);
                        }
                        break;

                    case 'ל':
                        newElement = new Consonant(Consonants.Lamed);
                        break;

                    case 'מ':
                    case 'ם':
                        newElement = new Consonant(Consonants.Mem);
                        break;

                    case 'נ':
                    case 'ן':
                        newElement = new Consonant(Consonants.Nun);
                        break;

                    case 'ס':
                        newElement = new Consonant(Consonants.Samekh);
                        break;

                    case 'ע':
                        newElement = new Consonant(Consonants.Ayin);
                        break;

                    case 'פ':
                    case 'ף':
                        if (l.HasDagesh)
                        {
                            newElement = new Consonant(Consonants.Pe);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Fe);
                        }
                        break;

                    case 'צ':
                    case 'ץ':
                        if (l.HasApostrophe)
                        {
                            newElement = new Consonant(Consonants.Tchaddik);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Tsaddik);
                        }
                        break;

                    case 'ק':
                        newElement = new Consonant(Consonants.Quf);
                        break;

                    case 'ר':
                        newElement = new Consonant(Consonants.Resh);
                        break;

                    case 'ש':
                        // The sin dot selects Sin; otherwise Shin.
                        if (l.HasModifier(HebrewChar.SinDot))
                        {
                            newElement = new Consonant(Consonants.Sin);
                        }
                        else
                        {
                            newElement = new Consonant(Consonants.Shin);
                        }
                        break;

                    case 'ת':
                        newElement = new Consonant(Consonants.Tav);
                        break;
                    }
                    // A letter not matched above leaves newElement null and falls
                    // through to the final else, which just skips the token.
                    if (newElement is Consonant)
                    {
                        // Origin bits of the most recent tag, if any.
                        TagTypes wordOrigin = TagTypes.Unrecognized;
                        if (lastTag != null)
                        {
                            wordOrigin = lastTag.Tag & TagTypes.Origin;
                        }

                        Consonant curConsonant = (Consonant)newElement;
                        // Classify the dagesh as light or strong.
                        if (l.HasDagesh)
                        {
                            if (wordOrigin == TagTypes.Foreign)
                            {
                                // Foreign words: only letters outside the
                                // IsBegedKefet set get a flag, and it is light.
                                if (!HebrewChar.IsBegedKefet(l.Letter))
                                {
                                    curConsonant.Flags |= ConsonantFlags.LightDagesh;
                                }
                            }
                            // Gutturals and yud never receive a dagesh flag here.
                            else if (!HebrewChar.IsGuttural(l.Letter) && l.Letter != 'י')
                            {
                                if (!HebrewChar.IsBegedKefet(l.Letter))
                                {
                                    curConsonant.Flags |= ConsonantFlags.StrongDagesh;
                                }
                                else
                                {
                                    // IsBegedKefet letters: light when there is no
                                    // previous letter/vowel or the previous vowel
                                    // was a silent schwa, strong otherwise.
                                    if ((prev == null) || (prevVowel == null) || (prevVowel.vowel == Vowels.SilentSchwa))
                                    {
                                        curConsonant.Flags |= ConsonantFlags.LightDagesh;
                                    }
                                    else
                                    {
                                        curConsonant.Flags |= ConsonantFlags.StrongDagesh;
                                    }
                                }
                            }
                        }
                        v = l.FirstVowel;

                        bool patahGnuva = false;

                        // Word-final Het/Ayin after an E/I/U/O-class vowel, with
                        // \u05B7 or no vowel of its own: a PatahGnuva vowel is
                        // added BEFORE the consonant itself.
                        if (curIsWordEnd && /*&& (v=='\u05B7')*/
                            ((curConsonant.Latin == Consonants.Het) || (curConsonant.Latin == Consonants.Ayin)))
                        {
                            if ((prevVowel != null) && prevVowel.IsVowelIn(Vowels.E | Vowels.I | Vowels.U | Vowels.O))
                            {
                                if ((v == '\u05B7') || (v == '\0'))
                                {
                                    AddElement(new Vowel(Vowels.PatahGnuva));
                                    patahGnuva = true;
                                }
                            }
                        }

                        // With the EverydayRegister option, Ayin, Aleph and He are
                        // marked silent.
                        if (Options.EverydayRegister)
                        {
                            if ((newElement.Latin == Consonants.Ayin) || (newElement.Latin == Consonants.Aleph) || (newElement.Latin == Consonants.He))
                            {
                                newElement.Silent = true;
                            }
                        }
                        AddElement(newElement);

                        //						Log.Parser.WriteLine("Added consonant "+curConsonant.Latin+" (sonority "+curConsonant.Sonority+")");
                        newElement = null;


                        // True when the next letter satisfies IsEhevi and carries
                        // neither vowels nor a mappiq; further narrowed below.
                        bool nextIsUnvoicedEhevi = (next != null) &&
                                                   HebrewChar.IsEhevi(next.Letter) &&
                                                   !next.HasAnyVowels &&
                                                   !next.HasMappiq;

                        if (nextIsUnvoicedEhevi)
                        {
                            // A following vav is excluded when the current letter
                            // carries \u05B7 or \u05B8.
                            if (next.Letter == 'ו')
                            {
                                nextIsUnvoicedEhevi &= !l.HasAnyModifier('\u05B7', '\u05B8');
                            }
                            // Excluded as well when the letter after next is a vav
                            // carrying holam/shuruk or any vowel besides shuruk.
                            if (further != null)
                            {
                                if (further.Letter == 'ו')
                                {
                                    nextIsUnvoicedEhevi &= !further.HasAnyModifier('\u05B9' /* holam */, HebrewChar.Shuruk);
                                    nextIsUnvoicedEhevi &= !further.HasAnyVowelsExcept(HebrewChar.Shuruk);
                                }
                            }
                            // NOTE(review): this else pairs with `further != null`,
                            // so the yud rule (current letter must carry \u05B4)
                            // applies only when there is no `further` letter.
                            // Confirm this pairing is intended.
                            else if (next.Letter == 'י')
                            {
                                nextIsUnvoicedEhevi &= (l.HasModifier('\u05B4'));
                            }
                        }
                        // Facts about the next letter used to classify a schwa.
                        bool nextHasHatafKamatz = (next != null) &&
                                                  next.HasModifier('\u05B3');
                        bool nextHasSchwa = (next != null) &&
                                            next.HasModifier('\u05B0');
                        bool nextHasHataf = (next != null) &&
                                            next.HasAnyModifier('\u05B1', '\u05B2', '\u05B3');
                        bool nextIsBegedKefet = (next != null) &&
                                                HebrewChar.IsBegedKefet(next.Letter);

                        /*						if (nextIsUnvoicedEhevi)
                         *  Log.Parser.WriteLine("Next token is an extender אהו\"י");*/
                        // The consonant is done; move past the current token.
                        i++;
                        // Map the letter's first vowel point onto a Vowel element.
                        switch (v)
                        {
                        case '\u05B0':
                            // Schwa: decide between silent and audible. The first
                            // matching rule in this chain wins.
                            if (wordOrigin == TagTypes.Foreign)
                            {
                                newElement = new Vowel(Vowels.SilentSchwa);
                            }
                            else if (prev == null)
                            {
                                newElement = new Vowel(Vowels.AudibleSchwa);
                            }
                            else if (next == null)
                            {
                                newElement = new Vowel(Vowels.SilentSchwa);
                            }
                            else if (nextHasSchwa | nextHasHataf)
                            {
                                newElement = new Vowel(Vowels.SilentSchwa);
                            }
                            else if (nextIsBegedKefet)
                            {
                                if (next.HasDagesh)
                                {
                                    newElement = new Vowel(Vowels.SilentSchwa);
                                }
                                else
                                {
                                    newElement = new Vowel(Vowels.AudibleSchwa);
                                }
                            }

                            /*								else if (((curConsonant.Latin)==Consonants.Aleph) ||
                             *       ((curConsonant.Latin)==Consonants.Ayin) ||
                             *       ((curConsonant.Latin)==Consonants.Het) ||
                             *       ((curConsonant.Latin)==Consonants.He) ||
                             *       ((curConsonant.Latin)==Consonants.Resh))
                             *  newElement=new Vowel(Vowels.AudibleSchwa);*/
                            else if (prevVowel != null)
                            {
                                switch (prevVowel.vowel)
                                {
                                case Vowels.SilentSchwa:
                                    // A schwa right after a silent schwa is audible.
                                    newElement = new Vowel(Vowels.AudibleSchwa);
                                    break;

                                default:
                                    // Silent after a short vowel; otherwise audible
                                    // only under a strong dagesh.
                                    if (prevVowel.IsVowelIn(Vowels.Short))
                                    {
                                        newElement = new Vowel(Vowels.SilentSchwa);
                                    }
                                    else if ((curConsonant.Flags & ConsonantFlags.StrongDagesh) != 0)
                                    {
                                        newElement = new Vowel(Vowels.AudibleSchwa);
                                    }
                                    //				else if (prevVowel.IsVowelIn(Vowels.Long|Vowels.VeryLong))
                                    //					newElement=new Vowel(Vowels.AudibleSchwa);
                                    else
                                    {
                                        newElement = new Vowel(Vowels.SilentSchwa);
                                    }
                                    break;
                                }
                            }
                            else if ((curConsonant.Flags & ConsonantFlags.StrongDagesh) != 0)
                            {
                                newElement = new Vowel(Vowels.AudibleSchwa);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.SilentSchwa);
                            }
                            break;

                        case '\u05B1':
                            newElement = new Vowel(Vowels.HatafSegol);
                            break;

                        case '\u05B2':
                            newElement = new Vowel(Vowels.HatafPatah);
                            break;

                        case '\u05B3':
                            newElement = new Vowel(Vowels.HatafKamatz);
                            break;

                        // For the vowels below, the "Male" variant is chosen when
                        // the next letter is an unvoiced extender.
                        case '\u05B4':
                            if (nextIsUnvoicedEhevi)
                            {
                                newElement = new Vowel(Vowels.HirikMale);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.HirikHaser);
                            }
                            break;

                        case '\u05B5':
                            if (nextIsUnvoicedEhevi)
                            {
                                newElement = new Vowel(Vowels.TzereMale);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.Tzere);
                            }
                            break;

                        case '\u05B6':
                            if (nextIsUnvoicedEhevi)
                            {
                                newElement = new Vowel(Vowels.SegolMale);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.Segol);
                            }
                            break;

                        case '\u05B7':
                            // Skipped when this patah was already emitted as a
                            // PatahGnuva ahead of the consonant.
                            if (!patahGnuva)
                            {
                                if (nextIsUnvoicedEhevi)
                                {
                                    newElement = new Vowel(Vowels.PatahMale);
                                }
                                else
                                {
                                    newElement = new Vowel(Vowels.Patah);
                                }
                            }
                            break;

                        case '\u05B8':
                            // KamatzKatan when the next letter carries \u05B3;
                            // otherwise left as KamatzIndeterminate.
                            if (nextIsUnvoicedEhevi)
                            {
                                newElement = new Vowel(Vowels.KamatzMale);
                            }
                            else if (nextHasHatafKamatz)
                            {
                                newElement = new Vowel(Vowels.KamatzKatan);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.KamatzIndeterminate);
                            }
                            break;

                        case '\u05B9':
                            if (nextIsUnvoicedEhevi)
                            {
                                newElement = new Vowel(Vowels.HolamMale);
                            }
                            else
                            {
                                newElement = new Vowel(Vowels.HolamHaser);
                            }
                            break;

                        case '\u05BB':
                            newElement = new Vowel(Vowels.Kubutz);
                            break;

                        default:
                            // No vowel (or an unhandled one): nothing is added.
                            //								if (v!=(char)0)
                            //									Log.Parser.WriteLine("Unknown vowel char: {0:X4}",(int)v);
                            break;
                        }
                        // The current letter becomes the look-behind context.
                        prev = l;
                        if (newElement != null)
                        {
                            /*if (curIsWordEnd && (((Vowel)newElement).vowel==Vowels.Patah)
                             *  && ((curConsonant.Latin==Consonants.Het)||(curConsonant.Latin==Consonants.Ayin)||(curConsonant.Latin==Consonants.He))) {
                             *  ((Vowel)newElement).vowel=Vowels.PatahGnuva;
                             *  parsed.Insert(parsed.Count-1,newElement);
                             *  Log.Parser.WriteLine("Added element "+((Vowel)newElement).vowel+" as patah gnuva");
                             * }
                             * else {*/
                            AddElement(newElement);
                            //								Log.Parser.WriteLine("Added element "+((Vowel)newElement).vowel);
                            //}
                            newElement = null;
                            // The next letter was absorbed into this vowel, so it
                            // is skipped rather than parsed on its own.
                            if (nextIsUnvoicedEhevi)
                            {
                                Log("UNVOICED EHEVI FOR CRYING OUT LOUD>>>>>>>>>>>>>");
                                // Cantillation tokens sitting between the current
                                // letter and the skipped one are still emitted.
                                for (int k = i; k < nextIndex; k++)
                                {
                                    Token tk = tokens[k];
                                    if (tk is CantillationToken)
                                    {
                                        newElement = new Cantillation((tk as CantillationToken).Value[0]);
                                        AddElement(newElement);
                                        Log("Added element " + newElement.Latin + " (" + newElement.GetType().Name + ") while skipping unvoiced ehevi");
                                        newElement = null;
                                    }
                                }
                                // Remove the skipped tokens from the real queue as
                                // well: nextIndex + 1 - tokensToConsume items,
                                // each reported through _ItemConsumed.
                                lock (InQueue)
                                    for (int z = 0; z < nextIndex + 1 - tokensToConsume; z++)
                                    {
                                        _ItemConsumed(InQueue.Dequeue());
                                    }
                                // Resume after the skipped letter.
                                i = nextIndex + 1;
                            }
                        }

                        /*else if (i<tokensToConsume) {
                         *  t=tokens[i];
                         *  if (t is LetterToken) {
                         *      l=(LetterToken)t;
                         *      if (l.Letter=='ו') {
                         *
                         *          if (newElement!=null) {
                         *              AddElement(newElement);
                         *              newElement=null;
                         *              i++;
                         *          }
                         *      }
                         *  }
                         * }*/
                    }
                    else// if (newElement!=null)
                    {
                        // Vowel produced directly by 'ו', or an unrecognised
                        // letter: just advance. A non-null newElement is added by
                        // the shared block below; prev is not updated on this path.
                        i++;
                    }
                }
                // Shared tail: add whatever element is still pending and keep
                // lastTag in sync.
                if (newElement != null)
                {
                    AddElement(newElement);
                    if (newElement is WordTag)
                    {
                        //						Log.Parser.WriteLine("Added tag "+((WordTag)newElement).Tag);
                        lastTag = (WordTag)newElement;
                    }
                    else
                    {
                        // A separator ends the scope of the last tag.
                        if (newElement is Separator)
                        {
                            lastTag = null;
                        }
                        //						Log.Parser.WriteLine("Added element "+newElement.Latin+" ("+newElement.GetType().Name+")");
                    }
                    newElement = null;
                }
            }
            // Only the very first call counts as the first window.
            if (isFirstWindow)
            {
                isFirstWindow = false;
            }
        }