// Runs before any input is consumed: lets the base class prepare, logs the start,
// and puts every piece of per-run state back to its initial value.
protected override void BeforeConsumption() {
    base.BeforeConsumption();
    Log(LogLevel.MajorInfo, "Started");

    // Token / element currently being worked on.
    t = null;
    newElement = null;

    // Context carried over from previously processed tokens.
    lastTag = null;
    prev = null;

    // Window bookkeeping: the first window is pending, the last one has not been seen.
    isFirstWindow = true;
    isLastWindow = false;

    prevVowel = null;
    // prevConsonant = null;   // this reset is disabled in the existing code; kept disabled.
    // The disabled statement sits on its own line so that it no longer comments out
    // the prevElement reset and the closing brace that follow it.
    prevElement = null;
}
// Invoked before consumption begins: defers to the base class, announces the start on the
// console, then clears all state left over from a previous run.
protected override void BeforeConsumption() {
    base.BeforeConsumption();
    Console.WriteLine("parser started");

    // Window flags: start at the first window, not yet at the last.
    isFirstWindow = true;
    isLastWindow = false;

    // Token and element under construction.
    t = null;
    newElement = null;

    // Look-behind context.
    lastTag = null;
    prev = null;
    prevVowel = null;
    prevConsonant = null;
    prevElement = null;
}
// Runs once the input is exhausted. After the base class has done its part, whatever the
// tokenizer is still holding is emitted as a final token:
//   - Neutral state: the buffered text, if any, becomes a plain Token;
//   - Letter state:  the pending letter plus the buffered characters becomes a LetterToken.
// Any other state emits nothing. Finally _DoneProducing() is called and completion is logged.
protected override void AfterConsumption() {
    base.AfterConsumption();

    switch (state) {
        case TokenState.Neutral:
            if (sb.Length > 0) {
                Token t = new Token(sb.ToString());
                // Emit the very instance that is logged below instead of allocating a
                // second, identical Token from the same buffer.
                this.Emit(t);
                Console.WriteLine("tokenizer: Ate neutral token " + t.Value);
            }
            break;

        case TokenState.Letter:
            LetterToken lt = new LetterToken(letter, sb.ToString());
            this.Emit(lt);
            Console.WriteLine("tokenizer: Ate letter " + lt.Value);
            break;
    }

    _DoneProducing();
    Console.WriteLine("tokenizer finished");
}
// End-of-input hook. Flushes the token that is still being assembled (buffered neutral text,
// or the pending letter), logs "Finished", and only then calls the base implementation and
// _DoneProducing().
protected override void AfterConsumption() {
    TokenState finalState = state;

    if (finalState == TokenState.Neutral) {
        // Neutral text is only turned into a token when something was actually buffered.
        if (sb.Length != 0) {
            string buffered = sb.ToString();
            Token neutral = new Token(buffered);
            Log("Producing neutral token " + neutral.Value);
            this.Emit(neutral);
        }
    } else if (finalState == TokenState.Letter) {
        // A letter is always flushed, even when no further characters were buffered for it.
        string buffered = sb.ToString();
        LetterToken pendingLetter = new LetterToken(letter, buffered);
        Log("Producing letter " + pendingLetter.Value);
        this.Emit(pendingLetter);
    }

    Log(LogLevel.MajorInfo, "Finished");
    base.AfterConsumption();
    _DoneProducing();
}
// Tokenizer step: removes one character from InQueue, reports it through _ItemConsumed, and
// advances the state machine, emitting tokens as they are completed.
//
//   Letter state   - characters accepted by HebrewChar.IsModifier are appended to the current
//                    letter; any other character completes the LetterToken and is then handled
//                    from the Neutral state.
//   Tag state      - characters accumulate until a '/', which emits a TagToken.
//   Neutral state  - a Hebrew letter or '/' starts a letter or a tag; a cantillation or
//                    punctuation character becomes its own single-character token; anything
//                    else is buffered as neutral text.
//
// Buffered neutral text is flushed before every token that follows it, so tokens are emitted in
// input order and neutral text on either side of a cantillation/punctuation mark is not merged.
protected override void Consume(Queue<char> InQueue) {
    char c = InQueue.Dequeue();
    _ItemConsumed(c);

    if (state == TokenState.Letter) {
        if (HebrewChar.IsModifier(c)) {
            sb.Append(c);
            return;
        }

        // c does not belong to the current letter: finish the letter, then fall through and
        // process c as if it had arrived in the Neutral state.
        LetterToken lt = new LetterToken(letter, sb.ToString());
        this.Emit(lt);
        Console.WriteLine("tokenizer: Ate letter " + lt.Value);
        sb.Length = 0;
        state = TokenState.Neutral;
    }

    switch (state) {
        case TokenState.Tag:
            if (c == '/') {
                state = TokenState.Neutral;
                TagToken tt = new TagToken(sb.ToString());
                this.Emit(tt);
                Console.WriteLine("tokenizer: Ate tag " + tt.Type);
                sb.Length = 0;
            } else {
                sb.Append(c);
            }
            break;

        case TokenState.Neutral:
            if (HebrewChar.IsLetter(c) || (c == '/')) {
                FlushNeutralText();
                if (c == '/') {
                    state = TokenState.Tag;
                } else {
                    letter = c;
                    state = TokenState.Letter;
                }
                sb.Length = 0;
            } else if (HebrewChar.IsCantillation(c)) {
                FlushNeutralText();
                CantillationToken ct = new CantillationToken(c.ToString());
                this.Emit(ct);
                Console.WriteLine("tokenizer: Ate cantillation mark " + ct.Value);
            } else if (HebrewChar.IsPunctuation(c)) {
                FlushNeutralText();
                PunctuationToken pt = new PunctuationToken(c.ToString());
                this.Emit(pt);
                Console.WriteLine("tokenizer: Ate punctuation " + pt.Value);
            } else {
                sb.Append(c);
            }
            break;
    }
}

// Emits the buffered neutral text, if any, as a plain Token and clears the buffer.
private void FlushNeutralText() {
    if (sb.Length > 0) {
        Token t = new Token(sb.ToString());
        this.Emit(t);
        Console.WriteLine("tokenizer: Ate neutral token " + t.Value);
        sb.Length = 0;
    }
}
// Consume (parser stage): turns tokens from InQueue into parsed elements, which are handed to AddElement.
//
// Outline of what the body does:
//  1. Under a lock on InQueue it snapshots the whole queue into `tokens` (ToArray) and dequeues one item.
//     tokensToConsume starts at 1; isLastWindow is set to !IsRunning unless tokensToConsume > windowSize.
//  2. Snapshot entries with index below tokensToConsume are processed; each is reported via _ItemConsumed.
//       - TagToken           -> WordTag; prev is cleared and, once added, the tag is remembered in lastTag.
//       - CantillationToken  -> Cantillation built from the first character of the token's Value.
//       - other non-letters  -> Separator; prev is cleared and, once added, lastTag is cleared.
//       - LetterToken        -> see 3.
//  3. For a letter the code looks ahead in the snapshot for `next` and `further`, skipping only
//     cantillation tokens (either stays null when the token found is not a LetterToken). It then:
//       - maps the Hebrew letter to a Consonant, taking dagesh / apostrophe / sin-dot into account;
//         vav may instead become a Vowel (HolamMale or Shuruk, the latter preceded by an Aleph consonant
//         at word start), and a consonantal vav becomes W when lastTag marks the word origin as Foreign;
//       - sets LightDagesh / StrongDagesh flags on the consonant depending on word origin, letter class,
//         prev and prevVowel;
//       - adds a PatahGnuva vowel before a word-final Het/Ayin when prevVowel is in E|I|U|O and the
//         letter's first vowel is patah or absent;
//       - marks Ayin, Aleph and He as Silent when Options.EverydayRegister is set;
//       - maps the letter's first vowel mark (v, which is not declared inside this method) to a Vowel;
//         for schwa the audible/silent choice depends on word origin, prev, next, prevVowel and dagesh;
//         several vowels switch to their "Male" variant when nextIsUnvoicedEhevi is true.
//  4. When nextIsUnvoicedEhevi is true and a vowel was added, cantillation tokens between the current
//     letter and `next` are added as Cantillation elements, nextIndex + 1 - tokensToConsume further
//     items are dequeued from InQueue (each reported via _ItemConsumed), and i jumps past `next`.
//  5. isFirstWindow is cleared at the end of the call.
//
// NOTE(review): tokensToConsume is the constant 1 when compared with windowSize, so that branch only
// runs for windowSize < 1 - confirm this is intended.
// NOTE(review): in this layout several `//` comments are followed by code on the same physical line
// (e.g. after "// furtherIndex = j;" and "// Look for a consonant"), and one commented-out "else if"
// is split across two physical lines. Presumably the original had a line break after each comment;
// confirm against the original formatting and restore the breaks, otherwise that code is commented out.
protected override void Consume(Queue <Token> InQueue) { Token[] tokens; lock (InQueue) { tokens = InQueue.ToArray(); InQueue.Dequeue(); } int tokensToConsume = 1; if (tokensToConsume > windowSize) { tokensToConsume -= windowSize; } else { isLastWindow = !this.IsRunning; } for (int i = 0; i < tokensToConsume;) { newElement = null; t = tokens[i]; _ItemConsumed(t); if (!(t is LetterToken)) { if (t is TagToken) { newElement = new WordTag(((TagToken)t).Type); prev = null; i++; } else if (t is CantillationToken) { newElement = new Cantillation((t as CantillationToken).Value[0]); i++; } else { newElement = new Separator(t.Value); prev = null; i++; } } else { LetterToken next = null; LetterToken further = null; int nextIndex = -1 /*, furtherIndex = -1*/; int j; /*j=i-1; * while ((j>0)&&!(tokens[j] is LetterToken)) { * if (!(tokens[j] is CantillationToken)) * break; * j--; * } * if (j>=0) { * prev=tokens[j] as LetterToken; * prevIndex=j; * }*/ j = i + 1; while ((j < tokens.Length - 1) && !(tokens[j] is LetterToken)) { if (!(tokens[j] is CantillationToken)) { break; } j++; } if (j < tokens.Length) { next = tokens[j] as LetterToken; nextIndex = j; } j++; while ((j < tokens.Length - 1) && !(tokens[j] is LetterToken)) { if (!(tokens[j] is CantillationToken)) { break; } j++; } if (j < tokens.Length) { further = tokens[j] as LetterToken; // furtherIndex = j; } bool curIsWordEnd = (isLastWindow && (i == tokens.Length - 1)) || (next == null); bool curIsWordStart = (isFirstWindow && (i == 0)) || (prev == null); // Look for a consonant LetterToken l = (LetterToken)t; switch (l.Letter) { case 'א': newElement = new Consonant(Consonants.Aleph); break; case 'ב': if (l.HasDagesh) { newElement = new Consonant(Consonants.Bet); } else { newElement = new Consonant(Consonants.Vet); } break; case 'ג': if (l.HasApostrophe) { newElement = new Consonant(Consonants.Jimmel); } else { newElement = new Consonant(Consonants.Gimmel); } break; case 'ד': newElement = new Consonant(Consonants.Dalet); 
break; case 'ה': newElement = new Consonant(Consonants.He); break; case 'ו': if ((l.HasDagesh && l.HasAnyVowels) || l.HasAnyVowelsExcept('\u05B9')) { newElement = new Consonant(Consonants.Vav); } else if ((next != null) && (next.Letter == 'ו') && (next.HasAnyModifier('\u05B9', HebrewChar.Shuruk))) { newElement = new Consonant(Consonants.Vav); } else { v = l.FirstVowel; switch (v) { case '\u05B9': newElement = new Vowel(Vowels.HolamMale); break; case '\0': if (l.HasShuruk) { if (curIsWordStart) { AddElement(newElement = new Consonant(Consonants.Aleph)); // Log.Parser.WriteLine("Added consonant "+newElement.Latin+" (sonority "+((Consonant)newElement).Sonority+")"); } newElement = new Vowel(Vowels.Shuruk); } else { newElement = new Consonant(Consonants.Vav); } break; } } if ((newElement is Consonant) && (lastTag != null)) { if ((lastTag.Tag & TagTypes.Origin) == TagTypes.Foreign) { newElement = new Consonant(Consonants.W); } } break; case 'ז': if (l.HasApostrophe) { newElement = new Consonant(Consonants.Zhayin); } else { newElement = new Consonant(Consonants.Zayin); } break; case 'ח': if (l.HasApostrophe) { newElement = new Consonant(Consonants.Khaf); } else { newElement = new Consonant(Consonants.Het); } break; case 'י': newElement = new Consonant(Consonants.Yud); break; case 'ט': newElement = new Consonant(Consonants.Tet); break; case 'כ': case 'ך': if (l.HasDagesh) { newElement = new Consonant(Consonants.Kaf); } else { newElement = new Consonant(Consonants.Khaf); } break; case 'ל': newElement = new Consonant(Consonants.Lamed); break; case 'מ': case 'ם': newElement = new Consonant(Consonants.Mem); break; case 'נ': case 'ן': newElement = new Consonant(Consonants.Nun); break; case 'ס': newElement = new Consonant(Consonants.Samekh); break; case 'ע': newElement = new Consonant(Consonants.Ayin); break; case 'פ': case 'ף': if (l.HasDagesh) { newElement = new Consonant(Consonants.Pe); } else { newElement = new Consonant(Consonants.Fe); } break; case 'צ': case 'ץ': if 
(l.HasApostrophe) { newElement = new Consonant(Consonants.Tchaddik); } else { newElement = new Consonant(Consonants.Tsaddik); } break; case 'ק': newElement = new Consonant(Consonants.Quf); break; case 'ר': newElement = new Consonant(Consonants.Resh); break; case 'ש': if (l.HasModifier(HebrewChar.SinDot)) { newElement = new Consonant(Consonants.Sin); } else { newElement = new Consonant(Consonants.Shin); } break; case 'ת': newElement = new Consonant(Consonants.Tav); break; } if (newElement is Consonant) { TagTypes wordOrigin = TagTypes.Unrecognized; if (lastTag != null) { wordOrigin = lastTag.Tag & TagTypes.Origin; } Consonant curConsonant = (Consonant)newElement; if (l.HasDagesh) { if (wordOrigin == TagTypes.Foreign) { if (!HebrewChar.IsBegedKefet(l.Letter)) { curConsonant.Flags |= ConsonantFlags.LightDagesh; } } else if (!HebrewChar.IsGuttural(l.Letter) && l.Letter != 'י') { if (!HebrewChar.IsBegedKefet(l.Letter)) { curConsonant.Flags |= ConsonantFlags.StrongDagesh; } else { if ((prev == null) || (prevVowel == null) || (prevVowel.vowel == Vowels.SilentSchwa)) { curConsonant.Flags |= ConsonantFlags.LightDagesh; } else { curConsonant.Flags |= ConsonantFlags.StrongDagesh; } } } } v = l.FirstVowel; bool patahGnuva = false; if (curIsWordEnd && /*&& (v=='\u05B7')*/ ((curConsonant.Latin == Consonants.Het) || (curConsonant.Latin == Consonants.Ayin))) { if ((prevVowel != null) && prevVowel.IsVowelIn(Vowels.E | Vowels.I | Vowels.U | Vowels.O)) { if ((v == '\u05B7') || (v == '\0')) { AddElement(new Vowel(Vowels.PatahGnuva)); patahGnuva = true; } } } if (Options.EverydayRegister) { if ((newElement.Latin == Consonants.Ayin) || (newElement.Latin == Consonants.Aleph) || (newElement.Latin == Consonants.He)) { newElement.Silent = true; } } AddElement(newElement); // Log.Parser.WriteLine("Added consonant "+curConsonant.Latin+" (sonority "+curConsonant.Sonority+")"); newElement = null; bool nextIsUnvoicedEhevi = (next != null) && HebrewChar.IsEhevi(next.Letter) && !next.HasAnyVowels 
&& !next.HasMappiq; if (nextIsUnvoicedEhevi) { if (next.Letter == 'ו') { nextIsUnvoicedEhevi &= !l.HasAnyModifier('\u05B7', '\u05B8'); } if (further != null) { if (further.Letter == 'ו') { nextIsUnvoicedEhevi &= !further.HasAnyModifier('\u05B9' /* holam */, HebrewChar.Shuruk); nextIsUnvoicedEhevi &= !further.HasAnyVowelsExcept(HebrewChar.Shuruk); } } else if (next.Letter == 'י') { nextIsUnvoicedEhevi &= (l.HasModifier('\u05B4')); } } bool nextHasHatafKamatz = (next != null) && next.HasModifier('\u05B3'); bool nextHasSchwa = (next != null) && next.HasModifier('\u05B0'); bool nextHasHataf = (next != null) && next.HasAnyModifier('\u05B1', '\u05B2', '\u05B3'); bool nextIsBegedKefet = (next != null) && HebrewChar.IsBegedKefet(next.Letter); /* if (nextIsUnvoicedEhevi) * Log.Parser.WriteLine("Next token is an extender אהו\"י");*/ i++; switch (v) { case '\u05B0': if (wordOrigin == TagTypes.Foreign) { newElement = new Vowel(Vowels.SilentSchwa); } else if (prev == null) { newElement = new Vowel(Vowels.AudibleSchwa); } else if (next == null) { newElement = new Vowel(Vowels.SilentSchwa); } else if (nextHasSchwa | nextHasHataf) { newElement = new Vowel(Vowels.SilentSchwa); } else if (nextIsBegedKefet) { if (next.HasDagesh) { newElement = new Vowel(Vowels.SilentSchwa); } else { newElement = new Vowel(Vowels.AudibleSchwa); } } /* else if (((curConsonant.Latin)==Consonants.Aleph) || * ((curConsonant.Latin)==Consonants.Ayin) || * ((curConsonant.Latin)==Consonants.Het) || * ((curConsonant.Latin)==Consonants.He) || * ((curConsonant.Latin)==Consonants.Resh)) * newElement=new Vowel(Vowels.AudibleSchwa);*/ else if (prevVowel != null) { switch (prevVowel.vowel) { case Vowels.SilentSchwa: newElement = new Vowel(Vowels.AudibleSchwa); break; default: if (prevVowel.IsVowelIn(Vowels.Short)) { newElement = new Vowel(Vowels.SilentSchwa); } else if ((curConsonant.Flags & ConsonantFlags.StrongDagesh) != 0) { newElement = new Vowel(Vowels.AudibleSchwa); } // else if 
(prevVowel.IsVowelIn(Vowels.Long|Vowels.VeryLong)) // newElement=new Vowel(Vowels.AudibleSchwa); else { newElement = new Vowel(Vowels.SilentSchwa); } break; } } else if ((curConsonant.Flags & ConsonantFlags.StrongDagesh) != 0) { newElement = new Vowel(Vowels.AudibleSchwa); } else { newElement = new Vowel(Vowels.SilentSchwa); } break; case '\u05B1': newElement = new Vowel(Vowels.HatafSegol); break; case '\u05B2': newElement = new Vowel(Vowels.HatafPatah); break; case '\u05B3': newElement = new Vowel(Vowels.HatafKamatz); break; case '\u05B4': if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.HirikMale); } else { newElement = new Vowel(Vowels.HirikHaser); } break; case '\u05B5': if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.TzereMale); } else { newElement = new Vowel(Vowels.Tzere); } break; case '\u05B6': if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.SegolMale); } else { newElement = new Vowel(Vowels.Segol); } break; case '\u05B7': if (!patahGnuva) { if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.PatahMale); } else { newElement = new Vowel(Vowels.Patah); } } break; case '\u05B8': if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.KamatzMale); } else if (nextHasHatafKamatz) { newElement = new Vowel(Vowels.KamatzKatan); } else { newElement = new Vowel(Vowels.KamatzIndeterminate); } break; case '\u05B9': if (nextIsUnvoicedEhevi) { newElement = new Vowel(Vowels.HolamMale); } else { newElement = new Vowel(Vowels.HolamHaser); } break; case '\u05BB': newElement = new Vowel(Vowels.Kubutz); break; default: // if (v!=(char)0) // Log.Parser.WriteLine("Unknown vowel char: {0:X4}",(int)v); break; } prev = l; if (newElement != null) { /*if (curIsWordEnd && (((Vowel)newElement).vowel==Vowels.Patah) * && ((curConsonant.Latin==Consonants.Het)||(curConsonant.Latin==Consonants.Ayin)||(curConsonant.Latin==Consonants.He))) { * ((Vowel)newElement).vowel=Vowels.PatahGnuva; * parsed.Insert(parsed.Count-1,newElement); * Log.Parser.WriteLine("Added 
element "+((Vowel)newElement).vowel+" as patah gnuva"); * } * else {*/ AddElement(newElement); // Log.Parser.WriteLine("Added element "+((Vowel)newElement).vowel); //} newElement = null; if (nextIsUnvoicedEhevi) { Log("UNVOICED EHEVI FOR CRYING OUT LOUD>>>>>>>>>>>>>"); for (int k = i; k < nextIndex; k++) { Token tk = tokens[k]; if (tk is CantillationToken) { newElement = new Cantillation((tk as CantillationToken).Value[0]); AddElement(newElement); Log("Added element " + newElement.Latin + " (" + newElement.GetType().Name + ") while skipping unvoiced ehevi"); newElement = null; } } lock (InQueue) for (int z = 0; z < nextIndex + 1 - tokensToConsume; z++) { _ItemConsumed(InQueue.Dequeue()); } i = nextIndex + 1; } } /*else if (i<tokensToConsume) { * t=tokens[i]; * if (t is LetterToken) { * l=(LetterToken)t; * if (l.Letter=='ו') { * * if (newElement!=null) { * AddElement(newElement); * newElement=null; * i++; * } * } * } * }*/ } else// if (newElement!=null) { i++; } } if (newElement != null) { AddElement(newElement); if (newElement is WordTag) { // Log.Parser.WriteLine("Added tag "+((WordTag)newElement).Tag); lastTag = (WordTag)newElement; } else { if (newElement is Separator) { lastTag = null; } // Log.Parser.WriteLine("Added element "+newElement.Latin+" ("+newElement.GetType().Name+")"); } newElement = null; } } if (isFirstWindow) { isFirstWindow = false; } }