Exemplo n.º 1
0
        /// <summary>
        /// Turns the pieces of a phrase that was split on prepositions into an ordered list of
        /// prepositional phrases. Pieces that do not contain the "~" marker are skipped.
        /// </summary>
        /// <param name="partsWithPreps">Pieces to inspect; only those containing "~" are converted.</param>
        /// <returns>One <see cref="PrepositionalPhrase"/> per "~"-marked piece, in input order.</returns>
        public List <PrepositionalPhrase> ProcessPrepositionalPhrases(string[] partsWithPreps)
        {
            List <PrepositionalPhrase> prepositionalChain = new List <PrepositionalPhrase>();

            foreach (string partsWithPrep in partsWithPreps)
            {
                if (!partsWithPrep.ContainsCheck("~"))
                {
                    //Not surprising: the first part is not a prep phrase, it is a verb intr or predicate.
                    continue;
                }

                TokenParserUtils pu          = new TokenParserUtils();
                string           preposition = pu.WordsPunctuationAndCompounds(partsWithPrep)[0];

                //Remove only the first occurrence of the preposition token. A blanket Replace would
                //also delete any later occurrence of the same text inside the phrase's tail.
                int    prepositionAt = partsWithPrep.IndexOf(preposition, StringComparison.Ordinal);
                string tail          = (prepositionAt < 0
                                            ? partsWithPrep
                                            : partsWithPrep.Remove(prepositionAt, preposition.Length)).Trim();

                //These chains are ordered.
                //kepeken x lon y kepeken z lon a   NOT EQUAL TO kepeken x  kepeken z lon a lon y
                //Maybe.
                prepositionalChain.Add(new PrepositionalPhrase(new Word(preposition), String.IsNullOrEmpty(tail) ? null : ProcessEnPiChain(tail)));
            }
            return(prepositionalChain);
        }
        /// <summary>
        /// Computes the fraction of tokens in <paramref name="sentence"/> that pass word validation.
        /// </summary>
        /// <param name="sentence">Text to score; null, empty or whitespace-only text scores 1.</param>
        /// <returns>
        /// (tokens - failing tokens) / tokens, or 1 when no tokens are found.
        /// </returns>
        public static decimal PercentTokiPona(string sentence)
        {
            if (string.IsNullOrWhiteSpace(sentence))
            {
                return 1;   //A blank has nothing wrong with it.
            }

            TokenParserUtils parser    = new TokenParserUtils();
            Token[]          tokens    = parser.ValidTokens(sentence);
            Word             validator = new Word();

            if (tokens.Length == 0)
            {
                return 1;   //Only punctuation, blanks or the like; nothing to count.
            }

            int failures = 0;
            for (int i = 0; i < tokens.Length; i++)
            {
                //Validate the token text with surrounding punctuation trimmed away.
                string bare = tokens[i].Text.Trim(ExtraneousPunctuation);
                if (validator.ValidateOnConstruction(bare, false).Length > 0)
                {
                    failures++;
                }
            }

            decimal total = tokens.Length;
            return (total - failures) / total;
        }
        /// <summary>
        /// Builds a <see cref="WordSet"/> from the words found in the string form of
        /// <paramref name="value"/>.
        /// </summary>
        /// <param name="value">Object whose <c>ToString()</c> text is scanned for words.</param>
        /// <returns>A set wrapping the words returned by <c>ValidWords</c>.</returns>
        public static WordSet Parse(object value)
        {
            TokenParserUtils parser = new TokenParserUtils();

            //The words here can't be particles.
            return new WordSet(parser.ValidWords(value.ToString()));
        }
Exemplo n.º 4
0
        /// <summary>
        /// Parses the first toki pona word found in <paramref name="value"/>.
        /// </summary>
        /// <param name="value">Text to parse; must not be null or empty.</param>
        /// <param name="provider">Format provider passed on to the <see cref="Word"/> constructor; must not be null.</param>
        /// <returns>A <see cref="Word"/> built from the first word found in the text.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="value"/> is null or empty, or no word could be found in it.
        /// </exception>
        /// <exception cref="ArgumentNullException"><paramref name="provider"/> is null.</exception>
        public static Word Parse(string value, IFormatProvider provider)
        {
            if (string.IsNullOrEmpty(value))
            {
                throw new ArgumentException("value is null or zero length string");
            }
            if (provider == null)
            {
                throw new ArgumentNullException("provider", "IFormatProvider cannot be null");
            }

            TokenParserUtils pu = new TokenParserUtils();

            string[] possibleWords = pu.JustTpWords(value);

            //Fail with a meaningful message instead of an index-out-of-range when nothing matched.
            if (possibleWords == null || possibleWords.Length == 0)
            {
                throw new ArgumentException("No toki pona word found in: " + value, "value");
            }
            return(new Word(possibleWords[0], provider));
        }
Exemplo n.º 5
0
        /// <summary>
        /// Checks whether <paramref name="value"/> is a phonologically valid word: it must pass the
        /// letter-set check, contain none of the forbidden sequences, and be returned verbatim by
        /// the word-matching tokenizer.
        /// </summary>
        /// <param name="value">Candidate word.</param>
        /// <returns>True when every check passes; false for null, empty or whitespace input.</returns>
        public static bool CheckIsValidPhonology(string value)
        {
            if (string.IsNullOrWhiteSpace(value))
            {
                return(false);
            }
            //Letters
            if (!ValidateLetterSet(value))
            {
                return(false);
            }

            //Invalid syllables.: "ji", "wu", "wo", "ti", "nm", "nn"
            //Lower-case with the invariant culture: a culture-sensitive ToLower can map 'I' to a
            //dotless i (e.g. under Turkish), which would let "TI"/"JI" slip past this check.
            string toLower = value.ToLowerInvariant();

            foreach (string invalid in new[] { "ji", "wu", "wo", "ti", "nm", "nn" })
            {
                if (toLower.ContainsCheck(invalid))
                {
                    return(false);
                }
            }

            //Casing
            //CVCVN
            TokenParserUtils pu = new TokenParserUtils();

            //Full regex.
            string[] matches = pu.JustTpWords(value);

            //Valid only if the tokenizer hands back the value itself, unchanged.
            return(matches.Length >= 1 && matches.Contains(value));
        }
Exemplo n.º 6
0
        /// <summary>
        /// Parses a headed phrase: a head word followed by its modifiers and, optionally, by
        /// "~"-marked prepositional phrases. Simple headed phrases parse fine; some headed phrases
        /// with PP modifiers parse too, but not when the PP is in maximal form.
        /// </summary>
        /// <param name="value">Phrase text; must be non-empty and free of the particles pi, la, e and li.</param>
        /// <returns>
        /// The parsed phrase, or the memoized phrase when memoization is enabled and the text is cached.
        /// </returns>
        public HeadedPhrase HeadedPhraseParser(string value)
        {
            if (String.IsNullOrEmpty(value))
            {
                throw new ArgumentException("Impossible to parse a null or zero length string.");
            }

            //Serve from the memo when we can.
            if (memoize && headedPhraseParserMemo.ContainsKey(value))
            {
                return headedPhraseParserMemo[value];
            }

            //A headed phrase may not contain any of these particles.
            string[] forbiddenParticles = { "pi", "la", "e", "li" };
            for (int i = 0; i < forbiddenParticles.Length; i++)
            {
                string particle = forbiddenParticles[i];
                if (value.StartsOrContainsOrEnds(particle))
                {
                    throw new TpSyntaxException("Headed phrases have no particles. This one has " + particle + " ref: " + value + " (Did we forget the li between the subject and verb?)");
                }
            }

            //Peel off prepositional phrases; what remains in value is the head plus modifiers.
            PrepositionalPhrase[] prepositionalPhrases = null;
            if (value.ContainsCheck("~"))
            {
                string[] pieces = Splitters.SplitOnPrepositions(value);
                value                = pieces[0];
                prepositionalPhrases = ProcessPrepositionalPhrases(ArrayExtensions.Tail(pieces)).ToArray();
            }

            //No pi at this point, so the remainder is a plain run of words.
            TokenParserUtils parser = new TokenParserUtils();
            Word[]           words  = parser.ValidWords(value);

            if (words.Length == 0)
            {
                throw new TpParseException("Failed to parse: " + value);
            }

            //Morphological structure of the modifiers is extracted here.
            var tagged = TurnThisWordsIntoWordsWithTaggedWords(words);

            //First merged word is the head; the remainder become the modifiers.
            var     head      = tagged[0];
            WordSet modifiers = new WordSet(ArrayExtensions.Tail(tagged.ToArray()));

            return new HeadedPhrase(head, modifiers, prepositionalPhrases);
        }
Exemplo n.º 7
0
        // jan li jo e soweli e kili e wawa lon anpa tawa anpa
        //     li jo e soweli e kili e wawa lon anpa tawa anpa
        /// <summary>
        /// Parses one predicate chunk (the text starting at its leading particle, e.g. "li ...")
        /// into a <see cref="TpPredicate"/>. Text that contains a stand-alone "e" token takes the
        /// transitive path (verb phrase + direct objects + optional prepositional phrases); anything
        /// else takes the intransitive/predicate path (verb phrase, nominal predicate or pi-predicate,
        /// plus optional prepositional phrases).
        /// </summary>
        /// <param name="liPart">Predicate text; its first token must be a particle.</param>
        /// <returns>
        /// A predicate built from the pi-predicate when one was found, otherwise from the verb phrase
        /// when no nominal predicate was found, otherwise from the nominal predicate.
        /// </returns>
        /// <exception cref="TpParseException">
        /// The text is null/whitespace, is exactly "li", yields no parts, has punctuation in its
        /// first part (intransitive path), or contains a degenerate e phrase.
        /// </exception>
        /// <exception cref="TpSyntaxException">The first token is not a particle.</exception>
        public TpPredicate ProcessPredicates(string liPart)
        {
            if (String.IsNullOrWhiteSpace(liPart))
            {
                throw new TpParseException("Missing argument, cannot continue");
            }
            if (liPart == "li")
            {
                throw new TpParseException("Cannot do anything with just li");
            }
            TokenParserUtils pu = new TokenParserUtils();
            Particle         verbPhraseParticle;
            ComplexChain     directObjectChain = null;
            VerbPhrase       verbPhrase        = null;

            PrepositionalPhrase[] prepositionalChain = null;
            ComplexChain          nominalPredicate   = null;
            PiPredicate           piPredicate        = null;

            //Transitive Path.
            //Chosen only when "e" appears as a whole token (split on space/tab), not as a substring.
            if (liPart.Split(new[] { ' ', '\t' }).Contains("e"))
            {
                //eParts[0] is the verb phrase part; the remaining entries are treated as direct objects below.
                string[] eParts = Splitters.SplitOnE(liPart);

                string[] verbPhraseParts = pu.WordsPunctuationAndCompounds(eParts[0]); //Could contain particles.

                if (!Token.CheckIsParticle(verbPhraseParts[0]))
                {
                    throw new TpSyntaxException("uh-oh not a particle: " + verbPhraseParts[0] + " from " + liPart);
                }
                verbPhraseParticle = new Particle(verbPhraseParts[0]);

                //Only process preps in normalized sentences
                string[] partsWithPreps = null;

                //More than just the particle: what follows is either a pi-chain (nominal predicate) or a verb phrase.
                if (verbPhraseParts.Length > 1)
                {
                    if (verbPhraseParts.Any(x => x == "pi"))
                    {
                        //nominal predicate
                        nominalPredicate =
                            new ComplexChain(Particles.en,
                                             new[] {
                            ProcessPiChain(String.Join(" ", ArrayExtensions.Tail(verbPhraseParts)))
                        });
                    }
                    else
                    {
                        verbPhrase = VerbPhraseParser(ArrayExtensions.Tail(verbPhraseParts));
                    }
                }

                //Only the last e-part is inspected for "~"-marked prepositional phrases.
                string verbsMaybePrepositions = eParts[eParts.Length - 1];


                if (verbsMaybePrepositions.ContainsCheck("~"))
                {
                    partsWithPreps = Splitters.SplitOnPrepositions(verbsMaybePrepositions);
                    if (partsWithPreps.Length == 1)
                    {
                        //This is the last e phrase or 1st prep.
                        if (partsWithPreps[0].ContainsCheck("~"))
                        {
                            //That is a prep phrase (is this possible?)
                        }
                        else
                        {
                            eParts[eParts.Length - 1] = partsWithPreps[0];
                            //No prep phrases.
                        }
                    }
                }

                //Everything after the verb phrase part.
                string[] directObjects = ArrayExtensions.Tail(eParts);

                //List<HeadedPhrase> doNPs = new List<HeadedPhrase>();
                List <Chain> doPiChains = new List <Chain>();

                //Fancy foot work for when we have e ... ~... & that's all.
                string[] toUse;
                if (partsWithPreps != null)
                {
                    //Replace the last direct object with the first split piece that starts with "e ",
                    //i.e. the e phrase without its trailing prepositional phrases.
                    //NOTE(review): toUse[0] throws if no piece starts with "e " -- confirm that cannot happen.
                    toUse = partsWithPreps.Where(x => x.StartCheck("e ")).ToArray();
                    directObjects[directObjects.Length - 1] = toUse[0];
                    toUse = directObjects;
                }
                else
                {
                    toUse = directObjects;
                }

                foreach (string directObject in toUse)
                {
                    if (directObject.Length <= 2)
                    {
                        throw new TpParseException("This is a degenerate e phrase, i.e. it is only e or e space. Missing a ni, e.g. e ni: possibly. ref: " + liPart);
                    }
                    //Skip the first two characters (the leading "e ") before parsing the object.
                    string eFree  = directObject.Substring(2);
                    Chain  phrase = ProcessPiChain(eFree);
                    doPiChains.Add(phrase);
                }
                directObjectChain = new ComplexChain(Particles.e, doPiChains.ToArray());

                if (partsWithPreps != null)
                {
                    prepositionalChain = ProcessPrepositionalPhrases(partsWithPreps).ToArray();
                }
            }
            else
            {
                //Intransitives & Predicates

                //ppParts[0] is the verb/predicate part; the remaining entries are prepositional pieces.
                string[] ppParts = Splitters.SplitOnPrepositions(liPart);

                if (ppParts.Length == 0) //Expect at least "li verb" or "li noun"
                {
                    throw new TpParseException("Whoa, got " + ppParts.Length + " parts for " + liPart);
                }

                if (Punctuation.ContainsPunctuation(ppParts[0]))
                {
                    throw new TpParseException("This has punctuation, may fail to parse : " + ppParts[0]);
                }
                string[] verbPhraseParts = pu.WordsPunctuationAndCompounds(ppParts[0]);

                if (!Token.CheckIsParticle(verbPhraseParts[0]))
                {
                    throw new TpSyntaxException("uh-oh not a particle: " + verbPhraseParts[0] + " from " + liPart);
                }
                verbPhraseParticle = new Particle(verbPhraseParts[0]);


                if (verbPhraseParts.Length > 1)
                {
                    //0:li 1:xxx 2:np...
                    //"XXXXZiXXXX" is a placeholder token; when present the predicate is built as a pi-predicate.
                    //NOTE(review): presumably inserted by an earlier normalization step -- confirm where.
                    if (verbPhraseParts[1].ContainsCheck("XXXXZiXXXX"))
                    {
                        //Make it go away. Confuses other parsers and will be picked up by container object.
                        verbPhraseParts = ArrayExtensions.Tail(verbPhraseParts);

                        //piPredicate
                        //After the Tail above, slot 0 is the placeholder; the Tail below drops it as well.
                        ComplexChain phrase = new ComplexChain(Particles.en,
                                                               new[] {
                            ProcessPiChain(String.Join(" ", ArrayExtensions.Tail(verbPhraseParts)))
                        });

                        piPredicate = new PiPredicate(Particles.pi, phrase);
                    }
                    else if (verbPhraseParts.Any(x => x == "pi"))
                    {
                        //nominal predicate
                        nominalPredicate = new ComplexChain(Particles.en,
                                                            new[] {
                            ProcessPiChain(String.Join(" ", ArrayExtensions.Tail(verbPhraseParts)))
                        }
                                                            );
                    }
                    else
                    {
                        verbPhrase = VerbPhraseParser(ArrayExtensions.Tail(verbPhraseParts));
                    }
                }


                string[] prepositions = ArrayExtensions.Tail(ppParts);

                if (prepositions.Length != 0)
                {
                    List <PrepositionalPhrase> pChains = new List <PrepositionalPhrase>();
                    foreach (string pp in prepositions)
                    {
                        string[] phraseParts = pu.WordsPunctuationAndCompounds(pp);//Could contain particles.
                        string   preposition = phraseParts[0];
                        string[] tail        = ArrayExtensions.Tail(phraseParts);

                        if (tail.Length == 0)
                        {
                            //uh oh. This is an intransitive verb, like "ni li lon"
                            //HACK: Oh, this is so ugly (still sort of ugly)
                            //A bare preposition becomes the verb phrase, with its "~" marker stripped.
                            //This overwrites any verb phrase assigned earlier in this method.
                            verbPhrase = new VerbPhrase(new Word(preposition.Replace("~", "")));
                            //or a noun phrase.

                            continue;
                        }

                        PrepositionalPhrase foundPrepositionalPhrase = new PrepositionalPhrase(new Word(preposition), ProcessEnPiChain(String.Join(" ", tail)));
                        pChains.Add(foundPrepositionalPhrase);
                    }
                    if (pChains.Count > 0)
                    {
                        prepositionalChain = pChains.ToArray();
                    }
                    else
                    {
                        //We changed our mind about a phrase being a prep phrase. Turned out to be verb phrase or predicate.
                    }
                }
            }
            //Pick the result shape: pi-predicate first, then verb phrase, then nominal predicate.
            if (piPredicate != null)
            {
                return(new TpPredicate(verbPhraseParticle, piPredicate, prepositionalChain));
            }
            if (nominalPredicate == null)
            {
                return(new TpPredicate(verbPhraseParticle, verbPhrase, directObjectChain, prepositionalChain));
            }

            return(new TpPredicate(verbPhraseParticle, nominalPredicate, directObjectChain, prepositionalChain));
        }
Exemplo n.º 8
0
        /// <summary>
        /// Parses a single simple sentence (one without la) into a <see cref="Sentence"/>. Handles,
        /// in order: comments, bare exclamations, head vocatives, a leading conjunction
        /// (taso/anu/en/ante), a trailing "anu seme" tag question, hortatives, exclamations with
        /// modifiers, vocative-only sentences, and finally subject + predicate sentences.
        /// </summary>
        /// <param name="sentence">Sentence text to parse.</param>
        /// <param name="punctuation">Punctuation to attach to the resulting sentence.</param>
        /// <param name="original">Original text, recorded in the diagnostics of a bare exclamation.</param>
        /// <returns>The parsed sentence.</returns>
        /// <exception cref="TpParseException">
        /// The text ends with " li" or contains la; other parse failures surface from the helpers called here.
        /// </exception>
        public Sentence ProcessSimpleSentence(string sentence, Punctuation punctuation, string original)
        {
            //A former hack stripped a trailing » or « here; it was disabled on suspicion of causing a bug.
            //HACK: Still need a better way to deal with quotes.

            //Comment? Get out of here!
            if (sentence.StartCheck("///"))
            {
                Comment c = new Comment(sentence);
                return(new Sentence(c, diagnostics));
            }

            //Simple exclamation! Get out of here!
            if (Exclamation.IsExclamation(sentence))
            {
                return(new Sentence(new Exclamation(new HeadedPhrase(new Word(sentence))), punctuation, new SentenceDiagnostics(original, sentence)));
            }

            List <Vocative> headVocatives = null;

            //jan Mato o, ale li pona. Head vocative!
            //kin la o moku. //not a vocative (hopefully dealt with elsewhere)
            //jan Mato o moku! //Head vocative, & imperative, with 2nd o discarded
            //jan Mato o o moku! //Head vocative, & imperative, with 2nd o discarded

            int doubleO = sentence.IndexOf(" o o ", StringComparison.Ordinal);
            if (sentence.ContainsCheck(" o o "))//Explicit vocative & imperative
            {
                //Okay, we know exactly when the head vocatives end.
                headVocatives = new List <Vocative>();
                string justHeadVocatives = sentence.Substring(0, doubleO);

                //Process head vocatives.
                ProcessHeadVocatives(Splitters.SplitOnO(justHeadVocatives), headVocatives, allAreVocatives: true);
                //BUG: Add the dummy! (And it still doesn't work!)
                //Skip the 5 characters of " o o " and put a dummy vocative in front of the rest.
                sentence = "jan Sanwan o " + sentence.Substring(doubleO + 5);
            }


            //Starts with o, then we have imperative & no head vocatives.
            //NOTE(review): the name says "or" but the test requires both; left as is -- confirm intent.
            bool endsOrStartsWithO = sentence.StartCheck("o ") && sentence.EndCheck(" o");

            if (!endsOrStartsWithO)
            {
                //jan So o! (We already deal with degenerate vocative sentences elsewhere)
                //jan So o sina li nasa.
                //jan So o nasa!
                //jan So o mi mute o nasa.  <-- This is the problem.

                //These could be vocatives or imperatives.
                if (sentence.ContainsCheck(" o ", " o,", ",o ") && sentence.ContainsCheck(" li "))
                {
                    headVocatives = new List <Vocative>();

                    ProcessHeadVocatives(Splitters.SplitOnO(sentence), headVocatives, allAreVocatives: false);

                    //Find the last "o" separator, trying each spelling the check above accepts.
                    int lastO = sentence.LastIndexOf(" o ", StringComparison.Ordinal);
                    if (lastO < 0)
                    {
                        lastO = sentence.LastIndexOf(" o,", StringComparison.Ordinal);
                    }
                    if (lastO < 0)
                    {
                        lastO = sentence.LastIndexOf(",o ", StringComparison.Ordinal);
                    }

                    //In all three spellings the "o" sits at lastO + 1, so the remainder starts at lastO + 2.
                    //Only cut when a separator was found; cutting at -1 + 2 would drop the first character.
                    if (lastO >= 0)
                    {
                        sentence = sentence.Substring(lastO + 2);
                    }
                }
            }

            //Process tag conjunctions and tag questions
            Particle    conjunction = null;
            TagQuestion tagQuestion = null;

            if (sentence.StartCheck("taso "))
            {
                conjunction = Particles.taso;
                sentence    = sentence.Substring(5);
            }
            else if (sentence.StartCheck("anu "))
            {
                conjunction = Particles.anu;
                sentence    = sentence.Substring(4);
            }
            else if (sentence.StartCheck("en "))
            {
                //Well, either parse it or throw. Otherwise, this gets skipped.
                //is this legal?
                conjunction = Particles.en;
                sentence    = sentence.Substring(3);
            }
            else if (sentence.StartCheck("ante ")) //never seen it.
            {
                conjunction = Particles.ante;
                sentence    = sentence.Substring(5);
            }

            //Should already have ? stripped off
            //Ordinal comparison, to agree with the ordinal LastIndexOf used for the cut.
            if (sentence.EndsWith(" anu seme", StringComparison.Ordinal))
            {
                tagQuestion = new TagQuestion();
                sentence    = sentence.Substring(0, sentence.LastIndexOf(" anu seme", StringComparison.Ordinal));
            }


            if (sentence.EndCheck(" li"))
            {
                throw new TpParseException("Something went wrong-- sentenc ends with li. " + sentence);
            }
            if (sentence.StartsOrContainsOrEnds("la"))
            {
                throw new TpParseException("If it contains a la, anywhere, it isn't a simple sentence. " + sentence);
            }

            bool isHortative = false;

            if (sentence.StartCheck("o ") && sentence.ContainsCheck(" li "))
            {
                //o mi mute li moku
                isHortative = true;
                sentence    = sentence.RemoveLeadingWholeWord("o");
            }
            //An imperative (o pana e pan: leading "o " and no " li ") keeps its leading o; the
            //subject handling further down recognizes it and parses the sentence without a subject.
            // someting o ==> vocative

            string[] liParts = Splitters.SplitOnLiOrO(sentence);

            if (liParts.Length == 1 && Exclamation.IsExclamation(liParts[0]))
            {
                //HACK: Duplicate code. & it only deals with a single final puncution mark.
                //Parse into a local so that a failed parse does not overwrite the caller's punctuation.
                string      possiblePunctuation = sentence[sentence.Length - 1].ToString();
                Punctuation trailingPunctuation;
                if (Punctuation.TryParse(possiblePunctuation, out trailingPunctuation))
                {
                    punctuation = trailingPunctuation;
                    sentence    = sentence.Substring(0, sentence.Length - 1);
                }

                //The whole thing is o! (or pakala! or the like)
                //pona a! a a a! ike a!
                TokenParserUtils tpu = new TokenParserUtils();

                Word[]       tokes         = tpu.ValidWords(sentence);
                HeadedPhrase parts         = new HeadedPhrase(tokes[0], new WordSet(ArrayExtensions.Tail(tokes)));
                bool         modifiersAreA = true;

                foreach (Word w in parts.Modifiers)
                {
                    if (w == "a")
                    {
                        continue;           //peculiar to exclamations & repeats.
                    }
                    if (w == "kin")
                    {
                        continue;             //modifies just about anything
                    }
                    modifiersAreA = false;
                }

                if (modifiersAreA)
                {
                    Exclamation exclamation = new Exclamation(parts);
                    Sentence    s           = new Sentence(exclamation, punctuation, diagnostics);
                    return(s);
                }
            }


            //Degenerate sentences.
            if (liParts[liParts.Length - 1].Trim(new char[] { ',', '«', '»', '!', ' ' }) == "o")
            {
                //We have a vocative sentence...
                Vocative vocative = new Vocative(ProcessEnPiChain(liParts[0]));
                Sentence s        = new Sentence(vocative, punctuation, diagnostics);
                return(s);
            }

            string subjects = liParts[0].Trim();

            ComplexChain subjectChain = null;
            int          startAt      = 1; //slot 0 is normally a subject

            if (subjects.StartCheck("o ") ||
                subjects.StartCheck("«o "))
            {
                //This is a verb phrase with implicit subjects!
                startAt = 0;
            }
            else
            {
                subjectChain = ProcessEnPiChain(subjects);
            }

            PredicateList verbPhrases = new PredicateList();

            for (int i = startAt; i < liParts.Length; i++)
            {
                string predicate = liParts[i].Trim();

                verbPhrases.Add(ProcessPredicates(predicate));
            }

            //Head or complete sentence.

            Sentence parsedSentence = new Sentence(subjectChain, verbPhrases, diagnostics, new SentenceOptionalParts
            {
                Conjunction = conjunction,
                //Etc
                Punctuation   = punctuation,
                IsHortative   = isHortative,
                TagQuestion   = tagQuestion,
                HeadVocatives = headVocatives != null ? headVocatives.ToArray() : null
            });

            return(parsedSentence);
        }
        /// <summary>
        /// Walks the sample corpus texts and, for each one, prints every token text that is
        /// classified as a compound word, number, proper modifier, foreign word or neologism,
        /// as "label text" lines. When several classifications match, the first one wins.
        /// </summary>
        public void NewThings()
        {
            string[] samples =
            {
                CorpusTexts.ProfesorAndMadMan,
                CorpusTexts.UnpaText,
                CorpusTexts.Gilgamesh,
                CorpusTexts.SampleText1,
                CorpusTexts.SampleText3,
                CorpusTexts.Lao,
                CorpusTexts.GeorgeSong,
                CorpusTexts.CrazyAnimal,
                CorpusTexts.CrazyAnimal2,
                //CorpusTexts.JanSin is left out: too many neologisms to cope.
                CorpusTexts.RuneDanceSong,
                CorpusTexts.janPusaRice,
                CorpusTexts.janPend
            };

            for (int sampleIndex = 0; sampleIndex < samples.Length; sampleIndex++)
            {
                //Split, normalize, tokenize, find words.
                TokenParserUtils            tp     = new TokenParserUtils();
                Dictionary <string, string> labels = new Dictionary <string, string>();

                foreach (Token toke in tp.ValidTokens(samples[sampleIndex]).Distinct())
                {
                    string text = toke.Text;

                    if (toke.CheckIsCompoundWord(text) && !labels.ContainsKey(text))
                    {
                        labels.Add(text, "Compound");
                    }

                    //Should just have to verify we can parse. No need for dictionary.
                    if (text.StartCheck("#") && toke.CheckIsNumber(text) && !labels.ContainsKey(text))
                    {
                        labels.Add(text, "Number");
                    }

                    if (toke.CheckIsProperModifier(text) && !labels.ContainsKey(text))
                    {
                        labels.Add(text, "Proper");
                    }

                    //NOTE(review): foreign words get the same "Proper" label as proper modifiers -- confirm this is intended.
                    if (ForeignWord.IsForeign(text) && !labels.ContainsKey(text))
                    {
                        labels.Add(text, "Proper");
                    }

                    if (Neologism.IsNeologism(text) && !labels.ContainsKey(text))
                    {
                        labels.Add(text, "Neologism");
                    }
                }

                foreach (KeyValuePair <string, string> entry in labels)
                {
                    Console.WriteLine(entry.Value + " " + entry.Key);
                }
            }
        }