Beispiel #1
0
        /// <summary>
        /// Scans the token chain starting at <paramref name="t"/> and collects the block titles
        /// (table of contents, introduction, chapters, literature, ...) recognized by <c>TryAttach</c>.
        /// </summary>
        /// <param name="t">First token of the text to scan.</param>
        /// <returns>
        /// The recognized titles in text order, or <c>null</c> when the text has no table of contents,
        /// no introduction and not exactly one literature title to anchor the search on.
        /// </returns>
        public static List<BlockTitleToken> TryAttachList(Pullenti.Ner.Token t)
        {
            BlockTitleToken content = null;
            BlockTitleToken intro = null;
            List<BlockTitleToken> lits = null;

            // Pass 1: look for an anchor - a table-of-contents title, an introduction title,
            // or (collected on the way) literature titles.
            for (Pullenti.Ner.Token tt = t; tt != null; tt = tt.Next)
            {
                if (!tt.IsNewlineBefore)
                {
                    continue;
                }
                BlockTitleToken btt = TryAttach(tt, false, null);
                if (btt == null)
                {
                    continue;
                }
                if (btt.Typ == BlkTyps.Index)
                {
                    content = btt;
                    break;
                }
                if (btt.Typ == BlkTyps.Intro)
                {
                    // Inspect up to five following lines: if they look like table-of-contents
                    // items, this "introduction" is really the first entry of the contents.
                    Pullenti.Ner.Token tt2 = btt.EndToken.Next;
                    for (int k = 0; k < 5; k++)
                    {
                        BlockLine li = BlockLine.Create(tt2, null);
                        if (li == null)
                        {
                            break;
                        }
                        if (li.HasContentItemTail || li.Typ == BlkTyps.IndexItem)
                        {
                            content = btt;
                            break;
                        }
                        if (li.HasVerb)
                        {
                            break;
                        }
                        // Another typed title within 400 characters of the introduction title.
                        if (li.Typ != BlkTyps.Undefined && (li.BeginChar - btt.EndChar) < 400)
                        {
                            content = btt;
                            break;
                        }
                        tt2 = li.EndToken.Next;
                    }
                    if (content == null)
                    {
                        intro = btt;
                    }
                    break;
                }
                if (btt.Typ == BlkTyps.Literature)
                {
                    if (lits == null)
                    {
                        lits = new List<BlockTitleToken>();
                    }
                    lits.Add(btt);
                }
            }

            // Without a contents or introduction anchor, proceed only when exactly one
            // literature title was found.
            if (content == null && intro == null && (lits == null || lits.Count != 1))
            {
                return null;
            }

            List<BlockTitleToken> res = new List<BlockTitleToken>();
            Pullenti.Ner.Core.TerminCollection chapterNames = new Pullenti.Ner.Core.TerminCollection();
            Pullenti.Ner.Token t0;

            if (content != null)
            {
                // Pass 2: absorb the table-of-contents items into the contents block and
                // remember their names so the real chapter titles can be matched later.
                res.Add(content);
                int cou = 0;
                for (Pullenti.Ner.Token tt = content.EndToken.Next; tt != null; tt = tt.Next)
                {
                    if (!tt.IsNewlineBefore)
                    {
                        continue;
                    }
                    BlockLine li = BlockLine.Create(tt, null);
                    if (li == null)
                    {
                        break;
                    }
                    // A line with a verb that ends with '.' or is longer than 100 characters
                    // ends the contents block.
                    if (li.HasVerb && (li.EndToken.IsChar('.') || li.LengthChar > 100))
                    {
                        break;
                    }
                    BlockTitleToken btt = TryAttach(tt, true, null);
                    if (btt == null)
                    {
                        continue;
                    }
                    if (btt.Typ == BlkTyps.Intro && (content.Typ == BlkTyps.Intro || cou > 2))
                    {
                        break;
                    }
                    cou++;
                    content.EndToken = btt.EndToken;
                    tt = btt.EndToken;
                    if (btt.Value != null)
                    {
                        chapterNames.AddString(btt.Value, null, null, false);
                    }
                }
                content.Typ = BlkTyps.Index;
                t0 = content.EndToken.Next;
            }
            else if (intro != null)
            {
                t0 = intro.BeginToken;
            }
            else
            {
                // The guard above guarantees that lits holds exactly one title here.
                t0 = t;
            }

            // Pass 3: collect the titles found from t0 onwards.
            bool first = true;
            for (Pullenti.Ner.Token tt = t0; tt != null; tt = tt.Next)
            {
                if (!tt.IsNewlineBefore)
                {
                    continue;
                }
                BlockTitleToken btt = TryAttach(tt, false, chapterNames);
                if (btt == null)
                {
                    continue;
                }
                tt = btt.EndToken;
                if (content != null && btt.Typ == BlkTyps.Index)
                {
                    continue;
                }
                // After a literature title only appendix, misc and literature titles are
                // accepted - plus a chapter, provided the literature title ended within the
                // first three quarters of the text.
                if (res.Count > 0 && res[res.Count - 1].Typ == BlkTyps.Literature
                    && btt.Typ != BlkTyps.Appendix && btt.Typ != BlkTyps.Misc && btt.Typ != BlkTyps.Literature)
                {
                    bool chapterBeforeTail = btt.Typ == BlkTyps.Chapter
                        && res[res.Count - 1].EndChar < ((tt.Kit.Sofa.Text.Length * 3) / 4);
                    if (!chapterBeforeTail)
                    {
                        continue;
                    }
                }
                // More than 300 characters precede the first title: insert a synthetic
                // chapter that marks the untitled beginning.
                if (first && (tt.BeginChar - t0.BeginChar) > 300)
                {
                    BlockTitleToken btt0 = new BlockTitleToken(t0, (t0.Previous == null ? t0 : t0.Previous));
                    btt0.Typ = BlkTyps.Chapter;
                    btt0.Value = "Похоже на начало";
                    res.Add(btt0);
                }
                res.Add(btt);
                first = false;
            }

            // Collapse runs of consecutive literature titles, keeping the first of each run.
            int i = 0;
            while (i < res.Count - 1)
            {
                if (res[i].Typ == BlkTyps.Literature && res[i + 1].Typ == res[i].Typ)
                {
                    res.RemoveAt(i + 1);
                }
                else
                {
                    i++;
                }
            }
            return res;
        }
Beispiel #2
0
        /// <summary>
        /// Analyses the line of text that starts at token <paramref name="t"/> and builds a
        /// <see cref="BlockLine"/> describing it: leading numbering (<c>NumberEnd</c>), block type
        /// (<c>Typ</c>), word statistics (<c>Words</c>, <c>NotWords</c>, <c>IsAllUpper</c>, <c>HasVerb</c>)
        /// and whether the line ends like a table-of-contents item (<c>HasContentItemTail</c>).
        /// </summary>
        /// <param name="t">First token of the line; <c>null</c> yields <c>null</c>.</param>
        /// <param name="names">
        /// Optional collection of already known title names. When the text after the numbering/keyword
        /// matches one of them up to the end of a line, <c>IsExistName</c> is set on the result.
        /// </param>
        /// <returns>
        /// The line description, or <c>null</c> when <paramref name="t"/> is <c>null</c> or when the whole
        /// line consists of a single "list / index / statement / conclusion / result" noun phrase.
        /// </returns>
        public static BlockLine Create(Pullenti.Ner.Token t, Pullenti.Ner.Core.TerminCollection names)
        {
            if (t == null)
            {
                return(null);
            }
            BlockLine res = new BlockLine(t, t);

            // Extend the line up to the last token before the next token that starts a new line.
            for (Pullenti.Ner.Token tt = t; tt != null; tt = tt.Next)
            {
                if (tt != t && tt.IsNewlineBefore)
                {
                    break;
                }
                else
                {
                    res.EndToken = tt;
                }
            }
            // NOTE(review): nums is incremented below but never read in this method.
            int nums = 0;

            // Consume the leading numbering. Each iteration accepts one number (a NumberToken or a
            // Roman numeral) provided it is followed by '.' or by a text token that is not all-lowercase.
            while (t != null && t.Next != null && t.EndChar <= res.EndChar)
            {
                if (t is Pullenti.Ner.NumberToken)
                {
                }
                else
                {
                    Pullenti.Ner.NumberToken rom = Pullenti.Ner.Core.NumberHelper.TryParseRoman(t);
                    if (rom != null && rom.EndToken.Next != null)
                    {
                        t = rom.EndToken;
                    }
                    else
                    {
                        break;
                    }
                }
                if (t.Next.IsChar('.'))
                {
                }
                else if ((t.Next is Pullenti.Ner.TextToken) && !t.Next.Chars.IsAllLower)
                {
                }
                else
                {
                    break;
                }
                res.NumberEnd = t;
                t             = t.Next;
                // A '.' directly after the number belongs to the numbering.
                if (t.IsChar('.') && t.Next != null)
                {
                    res.NumberEnd = t;
                    t             = t.Next;
                }
                // The numbering is followed by a line break: return the line without further analysis.
                if (t.IsNewlineBefore)
                {
                    return(res);
                }
                nums++;
            }
            // Look for a keyword from the ontology at the current position. If none is found and a
            // multi-token noun phrase starts here, retry at the noun of that phrase.
            Pullenti.Ner.Core.TerminToken tok = m_Ontology.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
            if (tok == null)
            {
                Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                if (npt1 != null && npt1.EndToken != npt1.BeginToken)
                {
                    tok = m_Ontology.TryParse(npt1.Noun.BeginToken, Pullenti.Ner.Core.TerminParseAttr.No);
                }
            }
            // A keyword directly preceded by ':' is rejected.
            if (tok != null)
            {
                if (t.Previous != null && t.Previous.IsChar(':'))
                {
                    tok = null;
                }
            }
            if (tok != null)
            {
                // The termin's Tag carries the block type.
                BlkTyps typ = (BlkTyps)tok.Termin.Tag;
                // A conclusion keyword is accepted only when it ends the line, or when it is followed by
                // a preposition and then a keyword whose type is Chapter.
                if (typ == BlkTyps.Conslusion)
                {
                    if (t.IsNewlineAfter)
                    {
                    }
                    else if (t.Next != null && t.Next.Morph.Class.IsPreposition && t.Next.Next != null)
                    {
                        Pullenti.Ner.Core.TerminToken tok2 = m_Ontology.TryParse(t.Next.Next, Pullenti.Ner.Core.TerminParseAttr.No);
                        if (tok2 != null && ((BlkTyps)tok2.Termin.Tag) == BlkTyps.Chapter)
                        {
                        }
                        else
                        {
                            tok = null;
                        }
                    }
                    else
                    {
                        tok = null;
                    }
                }
                // Reject the keyword when the token's language differs from the base language of the text.
                if (t.Kit.BaseLanguage != t.Morph.Language)
                {
                    tok = null;
                }
                // An Index keyword other than "ОГЛАВЛЕНИЕ" that does not end the line is rejected when it is
                // not followed by a noun phrase, or when the following noun phrase is genitive and ends the line.
                if (typ == BlkTyps.Index && !t.IsValue("ОГЛАВЛЕНИЕ", null))
                {
                    if (!t.IsNewlineAfter && t.Next != null)
                    {
                        Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                        if (npt != null && npt.IsNewlineAfter && npt.Morph.Case.IsGenitive)
                        {
                            tok = null;
                        }
                        else if (npt == null)
                        {
                            tok = null;
                        }
                    }
                }
                // "ВВЕДЕНИЕ" followed on the same line by a genitive noun phrase is rejected.
                // NOTE(review): presumably filters phrases like "introduction of <something>" - confirm.
                if ((typ == BlkTyps.Intro && tok != null && !tok.IsNewlineAfter) && t.IsValue("ВВЕДЕНИЕ", null))
                {
                    Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(t.Next, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                    if (npt != null && npt.Morph.Case.IsGenitive)
                    {
                        tok = null;
                    }
                }
                if (tok != null)
                {
                    // Keyword accepted. Without explicit numbering the keyword itself plays the role of
                    // the number part (and may extend the line).
                    if (res.NumberEnd == null)
                    {
                        res.NumberEnd = tok.EndToken;
                        if (res.NumberEnd.EndChar > res.EndChar)
                        {
                            res.EndToken = res.NumberEnd;
                        }
                    }
                    res.Typ = typ;
                    t       = tok.EndToken;
                    // An optional ':' or '.' right after the keyword is consumed.
                    if (t.Next != null && t.Next.IsCharOf(":."))
                    {
                        t            = t.Next;
                        res.EndToken = t;
                    }
                    // Nothing else on the line (or end of text): the result is complete.
                    if (t.IsNewlineAfter || t.Next == null)
                    {
                        return(res);
                    }
                    t = t.Next;
                }
            }
            // A paragraph sign followed by a number marks a chapter.
            if (t.IsChar('§') && (t.Next is Pullenti.Ner.NumberToken))
            {
                res.Typ       = BlkTyps.Chapter;
                res.NumberEnd = t;
                t             = t.Next;
            }
            // Match against the known names: accepted only when the match runs to the end of a line.
            if (names != null)
            {
                Pullenti.Ner.Core.TerminToken tok2 = names.TryParse(t, Pullenti.Ner.Core.TerminParseAttr.No);
                if (tok2 != null && tok2.EndToken.IsNewlineAfter)
                {
                    res.EndToken    = tok2.EndToken;
                    res.IsExistName = true;
                    if (res.Typ == BlkTyps.Undefined)
                    {
                        // Re-analyse the text after the numbering without the name list; take its type if it is
                        // Literature, Intro or Conclusion, otherwise treat the line as a chapter.
                        BlockLine li2 = Create((res.NumberEnd == null ? null : res.NumberEnd.Next), null);
                        if (li2 != null && ((li2.Typ == BlkTyps.Literature || li2.Typ == BlkTyps.Intro || li2.Typ == BlkTyps.Conslusion)))
                        {
                            res.Typ = li2.Typ;
                        }
                        else
                        {
                            res.Typ = BlkTyps.Chapter;
                        }
                    }
                    return(res);
                }
            }
            // Detect a table-of-contents tail: the line ends with a number or '.', and the token before it
            // is '.'. t1 is then moved back over the run of dots so they are excluded from the statistics.
            Pullenti.Ner.Token t1 = res.EndToken;
            if ((((t1 is Pullenti.Ner.NumberToken) || t1.IsChar('.'))) && t1.Previous != null)
            {
                t1 = t1.Previous;
                if (t1.IsChar('.'))
                {
                    res.HasContentItemTail = true;
                    for (; t1 != null && t1.BeginChar > res.BeginChar; t1 = t1.Previous)
                    {
                        if (!t1.IsChar('.'))
                        {
                            break;
                        }
                    }
                }
            }
            // Word statistics over the tokens from the current position up to t1.
            res.IsAllUpper = true;
            for (; t != null && t.EndChar <= t1.EndChar; t = t.Next)
            {
                // Anything that is not a letter text token counts as a non-word.
                if (!(t is Pullenti.Ner.TextToken) || !t.Chars.IsLetter)
                {
                    res.NotWords++;
                }
                else
                {
                    // Tokens with an undefined dictionary class are non-words; other tokens count as
                    // words only when longer than two characters.
                    Pullenti.Morph.MorphClass mc = t.GetMorphClassInDictionary();
                    if (mc.IsUndefined)
                    {
                        res.NotWords++;
                    }
                    else if (t.LengthChar > 2)
                    {
                        res.Words++;
                    }
                    if (!t.Chars.IsAllUpper)
                    {
                        res.IsAllUpper = false;
                    }
                    // Pure verbs set HasVerb, except terms ending in "ING".
                    if ((t as Pullenti.Ner.TextToken).IsPureVerb)
                    {
                        if (!(t as Pullenti.Ner.TextToken).Term.EndsWith("ING"))
                        {
                            res.HasVerb = true;
                        }
                    }
                }
            }
            // The line has no type yet: try to derive one from the noun phrase that starts the text
            // (after the numbering, if any).
            if (res.Typ == BlkTyps.Undefined)
            {
                Pullenti.Ner.Core.NounPhraseToken npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse((res.NumberEnd == null ? res.BeginToken : res.NumberEnd.Next), Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                if (npt != null)
                {
                    // "ХАРАКТЕРИСТИКА" / "СОДЕРЖАНИЕ" followed only by genitive noun phrases (dots are skipped)
                    // is typed as Intro.
                    if (npt.Noun.IsValue("ХАРАКТЕРИСТИКА", null) || npt.Noun.IsValue("СОДЕРЖАНИЕ", "ЗМІСТ"))
                    {
                        bool ok = true;
                        for (Pullenti.Ner.Token tt = npt.EndToken.Next; tt != null && tt.EndChar <= res.EndChar; tt = tt.Next)
                        {
                            if (tt.IsChar('.'))
                            {
                                continue;
                            }
                            Pullenti.Ner.Core.NounPhraseToken npt2 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                            if (npt2 == null || !npt2.Morph.Case.IsGenitive)
                            {
                                ok = false;
                                break;
                            }
                            tt = npt2.EndToken;
                            // The noun phrase runs past the current line end: extend the line to it, and
                            // further on to the next token that is followed by a line break.
                            if (tt.EndChar > res.EndChar)
                            {
                                res.EndToken = tt;
                                if (!tt.IsNewlineAfter)
                                {
                                    for (; res.EndToken.Next != null; res.EndToken = res.EndToken.Next)
                                    {
                                        if (res.EndToken.IsNewlineAfter)
                                        {
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                        if (ok)
                        {
                            res.Typ         = BlkTyps.Intro;
                            res.IsExistName = true;
                        }
                    }
                    // "ВЫВОД" / "РЕЗУЛЬТАТ" followed only by result / recommendation / research noun phrases
                    // (commas, dots and conjunctions are skipped) is typed as Conclusion.
                    else if (npt.Noun.IsValue("ВЫВОД", "ВИСНОВОК") || npt.Noun.IsValue("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ"))
                    {
                        bool ok = true;
                        for (Pullenti.Ner.Token tt = npt.EndToken.Next; tt != null && tt.EndChar <= res.EndChar; tt = tt.Next)
                        {
                            if (tt.IsCharOf(",.") || tt.IsAnd)
                            {
                                continue;
                            }
                            Pullenti.Ner.Core.NounPhraseToken npt1 = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                            if (npt1 != null)
                            {
                                if (npt1.Noun.IsValue("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ") || npt1.Noun.IsValue("РЕКОМЕНДАЦИЯ", "РЕКОМЕНДАЦІЯ") || npt1.Noun.IsValue("ИССЛЕДОВАНИЕ", "ДОСЛІДЖЕННЯ"))
                                {
                                    tt = npt1.EndToken;
                                    // Same line extension as in the Intro branch above.
                                    if (tt.EndChar > res.EndChar)
                                    {
                                        res.EndToken = tt;
                                        if (!tt.IsNewlineAfter)
                                        {
                                            for (; res.EndToken.Next != null; res.EndToken = res.EndToken.Next)
                                            {
                                                if (res.EndToken.IsNewlineAfter)
                                                {
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    continue;
                                }
                            }
                            ok = false;
                            break;
                        }
                        if (ok)
                        {
                            res.Typ         = BlkTyps.Conslusion;
                            res.IsExistName = true;
                        }
                    }
                    // Still untyped: check for a literature / publication list heading.
                    if (res.Typ == BlkTyps.Undefined && npt != null && npt.EndChar <= res.EndChar)
                    {
                        bool ok   = false;
                        // publ counts the noun phrases accepted by _isPub.
                        // NOTE(review): _isPub is defined outside this view; presumably it recognises
                        // "publication"-like nouns - confirm.
                        int  publ = 0;
                        if (_isPub(npt))
                        {
                            ok   = true;
                            publ = 1;
                        }
                        else if ((npt.Noun.IsValue("СПИСОК", null) || npt.Noun.IsValue("УКАЗАТЕЛЬ", "ПОКАЖЧИК") || npt.Noun.IsValue("ПОЛОЖЕНИЕ", "ПОЛОЖЕННЯ")) || npt.Noun.IsValue("ВЫВОД", "ВИСНОВОК") || npt.Noun.IsValue("РЕЗУЛЬТАТ", "ДОСЛІДЖЕННЯ"))
                        {
                            // The noun phrase is the whole line: no BlockLine is produced at all.
                            if (npt.EndChar == res.EndChar)
                            {
                                return(null);
                            }
                            ok = true;
                        }
                        if (ok)
                        {
                            // A bare single-token "СПИСОК" that ends the line does not qualify.
                            if (npt.BeginToken == npt.EndToken && npt.Noun.IsValue("СПИСОК", null) && npt.EndChar == res.EndChar)
                            {
                                ok = false;
                            }
                            // The rest of the line may contain only punctuation, conjunctions, prepositions,
                            // "ОТРАЖЕНЫ", and noun phrases from the accepted set below.
                            for (Pullenti.Ner.Token tt = npt.EndToken.Next; tt != null && tt.EndChar <= res.EndChar; tt = tt.Next)
                            {
                                if (tt.IsCharOf(",.:") || tt.IsAnd || tt.Morph.Class.IsPreposition)
                                {
                                    continue;
                                }
                                if (tt.IsValue("ОТРАЖЕНЫ", "ВІДОБРАЖЕНІ"))
                                {
                                    continue;
                                }
                                // Note: npt is reassigned here, so after the loop it no longer refers to
                                // the leading noun phrase.
                                npt = Pullenti.Ner.Core.NounPhraseHelper.TryParse(tt, Pullenti.Ner.Core.NounPhraseParseAttr.No, 0, null);
                                if (npt == null)
                                {
                                    ok = false;
                                    break;
                                }
                                if (((_isPub(npt) || npt.Noun.IsValue("РАБОТА", "РОБОТА") || npt.Noun.IsValue("ИССЛЕДОВАНИЕ", "ДОСЛІДЖЕННЯ")) || npt.Noun.IsValue("АВТОР", null) || npt.Noun.IsValue("ТРУД", "ПРАЦЯ")) || npt.Noun.IsValue("ТЕМА", null) || npt.Noun.IsValue("ДИССЕРТАЦИЯ", "ДИСЕРТАЦІЯ"))
                                {
                                    tt = npt.EndToken;
                                    if (_isPub(npt))
                                    {
                                        publ++;
                                    }
                                    // Same line extension as in the branches above.
                                    if (tt.EndChar > res.EndChar)
                                    {
                                        res.EndToken = tt;
                                        if (!tt.IsNewlineAfter)
                                        {
                                            for (; res.EndToken.Next != null; res.EndToken = res.EndToken.Next)
                                            {
                                                if (res.EndToken.IsNewlineAfter)
                                                {
                                                    break;
                                                }
                                            }
                                        }
                                    }
                                    continue;
                                }
                                ok = false;
                                break;
                            }
                            if (ok)
                            {
                                res.Typ         = BlkTyps.Literature;
                                res.IsExistName = true;
                                // No _isPub match and the line ends within the first two thirds of the text:
                                // downgrade to Misc when numbered, otherwise back to Undefined.
                                if (publ == 0 && (res.EndChar < (((res.Kit.Sofa.Text.Length * 2) / 3))))
                                {
                                    if (res.NumberEnd != null)
                                    {
                                        res.Typ = BlkTyps.Misc;
                                    }
                                    else
                                    {
                                        res.Typ = BlkTyps.Undefined;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            return(res);
        }
Beispiel #3
0
        /// <summary>
        /// Tries to recognize a block title (chapter, appendix, introduction, literature, ...)
        /// in the line that starts at token <paramref name="t"/>.
        /// </summary>
        /// <param name="t">Candidate first token; must start a new line and must not be all-lowercase.</param>
        /// <param name="isContentItem">
        /// <c>true</c> when the line is being read as an entry of a table of contents: the upper-case
        /// requirements are relaxed and a line with a content-item tail is typed as <c>IndexItem</c>.
        /// </param>
        /// <param name="names">Optional collection of known title names, passed on to <c>BlockLine.Create</c>.</param>
        /// <returns>The recognized title token, or <c>null</c> when the line is not a title.</returns>
        public static BlockTitleToken TryAttach(Pullenti.Ner.Token t, bool isContentItem = false, Pullenti.Ner.Core.TerminCollection names = null)
        {
            if (t == null || !t.IsNewlineBefore || t.Chars.IsAllLower)
            {
                return null;
            }
            BlockLine li = BlockLine.Create(t, names);
            if (li == null)
            {
                return null;
            }
            if (li.Words == 0 && li.Typ == BlkTyps.Undefined)
            {
                return null;
            }

            // The line matched one of the known names: accept it as is.
            if (li.IsExistName)
            {
                return new BlockTitleToken(t, li.EndToken)
                {
                    Typ = li.Typ
                };
            }

            // The line consists only of the number/keyword part (optionally followed by '.' or ':').
            if (li.EndToken == li.NumberEnd || (li.EndToken.IsCharOf(".:") && li.EndToken.Previous == li.NumberEnd))
            {
                BlockTitleToken res2 = new BlockTitleToken(t, li.EndToken)
                {
                    Typ = li.Typ
                };

                if (li.Typ == BlkTyps.Chapter || li.Typ == BlkTyps.Appendix)
                {
                    // The title text may follow on the next lines: absorb consecutive untyped,
                    // all-upper-case lines that contain at least one word.
                    BlockLine li2 = BlockLine.Create(li.EndToken.Next, names);
                    if ((li2 != null && li2.Typ == BlkTyps.Undefined && li2.IsAllUpper) && li2.Words > 0)
                    {
                        res2.EndToken = li2.EndToken;
                        for (Pullenti.Ner.Token tt = res2.EndToken.Next; tt != null; tt = tt.Next)
                        {
                            li2 = BlockLine.Create(tt, names);
                            if (li2 == null)
                            {
                                break;
                            }
                            if (li2.Typ != BlkTyps.Undefined || !li2.IsAllUpper || li2.Words == 0)
                            {
                                break;
                            }
                            res2.EndToken = li2.EndToken;
                            tt = li2.EndToken;
                        }
                    }
                }
                return res2;
            }

            // From here on a number/keyword part is required.
            if (li.NumberEnd == null)
            {
                return null;
            }
            BlockTitleToken res = new BlockTitleToken(t, li.EndToken)
            {
                Typ = li.Typ
            };

            if (res.Typ == BlkTyps.Undefined)
            {
                // A numbered line without a keyword is a chapter candidate: it needs at least one
                // word and no verb, and outside a table of contents it must be all upper case with
                // non-words not exceeding half the word count.
                if (li.Words < 1 || li.HasVerb)
                {
                    return null;
                }
                if (!isContentItem && (!li.IsAllUpper || li.NotWords > (li.Words / 2)))
                {
                    return null;
                }
                res.Typ = BlkTyps.Chapter;
                // NOTE(review): a number part ending exactly 7 characters after the line start and
                // followed by a hyphen is explicitly not a chapter - presumably this filters out
                // something like phone numbers; confirm.
                if ((li.NumberEnd.EndChar - t.BeginChar) == 7 && li.NumberEnd.Next != null && li.NumberEnd.Next.IsHiphen)
                {
                    res.Typ = BlkTyps.Undefined;
                }
            }
            if (li.HasContentItemTail && isContentItem)
            {
                res.Typ = BlkTyps.IndexItem;
            }
            if (res.Typ == BlkTyps.Chapter || res.Typ == BlkTyps.Appendix)
            {
                if (li.HasVerb)
                {
                    return null;
                }
                if (li.NotWords > li.Words && !isContentItem)
                {
                    return null;
                }
                // Absorb continuation lines of a multi-line title: untyped, unnumbered lines with
                // words and no verb (and all upper case unless reading a table of contents).
                for (Pullenti.Ner.Token next = li.EndToken.Next; next != null; next = next.Next)
                {
                    BlockLine li2 = BlockLine.Create(next, names);
                    if (li2 == null)
                    {
                        break;
                    }
                    if (li2.HasVerb || (li2.Words < 1))
                    {
                        break;
                    }
                    if (!li2.IsAllUpper && !isContentItem)
                    {
                        break;
                    }
                    if (li2.Typ != BlkTyps.Undefined || li2.NumberEnd != null)
                    {
                        break;
                    }
                    res.EndToken = li2.EndToken;
                    next = li2.EndToken;
                    if (isContentItem && li2.HasContentItemTail)
                    {
                        res.Typ = BlkTyps.IndexItem;
                        break;
                    }
                }
            }

            // The title value is the text from just after the number part up to the last letter
            // token of the title (trailing punctuation, page numbers etc. are cut off).
            for (Pullenti.Ner.Token tt = res.EndToken; tt != null && tt.BeginChar > li.NumberEnd.EndChar; tt = tt.Previous)
            {
                if ((tt is Pullenti.Ner.TextToken) && tt.Chars.IsLetter)
                {
                    res.Value = Pullenti.Ner.Core.MiscHelper.GetTextValue(li.NumberEnd.Next, tt, Pullenti.Ner.Core.GetTextAttr.No);
                    break;
                }
            }

            // Keyword titles are rejected when their value is longer than 100 characters or when
            // non-words outnumber words.
            if ((res.Typ == BlkTyps.Index || res.Typ == BlkTyps.Intro || res.Typ == BlkTyps.Conslusion) || res.Typ == BlkTyps.Literature)
            {
                if (res.Value != null && res.Value.Length > 100)
                {
                    return null;
                }
                if (li.Words < li.NotWords)
                {
                    return null;
                }
            }
            return res;
        }
    }