Пример #1
0
 /// <summary>
 /// Creates the language module for the given evaluator and stores the two languages it works with.
 /// </summary>
 /// <param name="__parent">Evaluator forwarded to the base constructor.</param>
 /// <param name="__langA">Language stored in <c>languageA</c>.</param>
 /// <param name="__langB">Language stored in <c>languageB</c>.</param>
 public languageModule(ISpiderEvaluatorBase __parent, basicLanguage __langA, basicLanguage __langB)
     : base(
         "Language Module",
         "The Targets are distributed into layers by the Passive rules and Active rules testing the tokens of the Target.",
         __parent)
 {
     // The setup() call is disabled (commented out) in this constructor.
     languageA = __langA;
     languageB = __langB;
 }
Пример #2
0
 /// <summary>
 /// Creates the TF-IDF language rule for <paramref name="__language"/> and fills the language name and
 /// layer identifiers into the name and description templates handed to the base constructor.
 /// </summary>
 /// <param name="__language">Language stored in <c>language</c>; its English name is inserted into the texts.</param>
 /// <param name="__layerID">Layer identifier forwarded to the base constructor.</param>
 /// <param name="__parent">Evaluator forwarded to the base constructor.</param>
 /// <param name="__layerID2">Second layer identifier forwarded to the base constructor; defaults to -1.</param>
 public layerLanguageTFIDF_ARule(basicLanguage __language, int __layerID, ISpiderEvaluatorBase __parent, int __layerID2 = -1)
     : base(
         "Language TF-IDF Test ({0})",
         "Tests Target tokens against the specified language [{0}], sets layerID [{1}] and calculates layer weight score as sum of matched Target token TF-IDFs minus sum of unmatched."
         + "If resulting weight score is more than 0 the layerID [{1}] is assigned, if it's less than 0 then the layer2ID [{2}] is assigned",
         __layerID,
         __parent,
         __layerID2)
 {
     language = __language;

     // name and description still hold the {0}/{1}/{2} templates at this point; replace the placeholders.
     var englishName = language.languageEnglishName;

     name        = string.Format(name, englishName);
     description = string.Format(description, englishName, layerID, layer2ID);
 }
 /// <summary>
 /// Looks up the registered basic language for <paramref name="languageID"/> and calls its
 /// <c>checkHuspell()</c> before returning it.
 /// </summary>
 /// <param name="languageID">The language identifier used as the registry key.</param>
 /// <returns>
 /// The registered <see cref="basicLanguage"/>, or <c>null</c> when the registry has no entry for the identifier.
 /// </returns>
 public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
 {
     basicLanguage output;

     // One TryGetValue call replaces ContainsKey + indexer: a single lookup, and no window
     // between the existence check and the read.
     if (!basicLanguageRegistry.TryGetValue(languageID, out output))
     {
         return(null);
     }

     output.checkHuspell();
     return(output);
 }
Пример #4
0
        /// <summary>
        /// Main tokenization command: builds the sentences for this paragraph, tokenizes each sentence
        /// (tokens and sub-sentences) and, depending on the flags, attaches the sentences to the paragraph.
        /// </summary>
        /// <typeparam name="TSentence">Type used for sentences.</typeparam>
        /// <typeparam name="TSubSentence">Type used for sub-sentences.</typeparam>
        /// <typeparam name="TToken">Type used for tokens.</typeparam>
        /// <param name="resources">
        /// Loosely typed arguments resolved by type: an optional <see cref="basicLanguage"/> (a new instance
        /// is created when none is supplied) and values of paragraphDetectionFlag, sentenceDetectionFlag,
        /// contentPreprocessFlag and tokenDetectionFlag. The complete array is also forwarded to the
        /// per-sentence tokenization call. Registration of sentences and tokens in a supplied IContentPage
        /// is not performed by this method.
        /// </param>
        public virtual void setParagraphFromContent <TSentence, TSubSentence, TToken>(params object[] resources)
            where TSentence : IContentSentence, new()
            where TSubSentence : IContentSubSentence, new()
            where TToken : class, IContentToken, new()
        {
            basicLanguage basicLanguages = resources.getFirstOfType <basicLanguage>();

            if (basicLanguages == null)
            {
                basicLanguages = new basicLanguage();
            }

            paragraphDetectionFlag flags           = resources.getFirstOfType <paragraphDetectionFlag>();
            sentenceDetectionFlag  sentenceFlags   = resources.getFirstOfType <sentenceDetectionFlag>();
            contentPreprocessFlag  preprocessFlags = resources.getFirstOfType <contentPreprocessFlag>();
            tokenDetectionFlag     tokenFlags      = resources.getFirstOfType <tokenDetectionFlag>();

            contentSentenceCollection sentences = _setSentencesFromContent <TSentence>(sentenceFlags, preprocessFlags);

            foreach (TSentence sn in sentences)
            {
                // Called for its effect on the sentence (sn.items is inspected below); the returned value is not needed.
                sn.setTokensFromContent <TToken, TSubSentence>(flags, sentenceFlags, preprocessFlags,
                                                               tokenFlags, resources, basicLanguages);

                // Optionally skip sentences that ended up without any token.
                if (flags.HasFlag(paragraphDetectionFlag.dropSentenceWithNoToken) && sn.items.Count == 0)
                {
                    continue;
                }

                if (sentenceFlags.HasFlag(sentenceDetectionFlag.setSentenceToParagraph))
                {
                    setItem(sn);
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Builds a new basic language object from the language data set entry of
        /// <paramref name="languageID"/> and calls <c>checkHuspell(true)</c> on it.
        /// </summary>
        /// <param name="languageID">The language identifier used to select the data set entry.</param>
        /// <returns>The newly created and populated <see cref="basicLanguage"/>.</returns>
        public static basicLanguage GetBasicLanguage(basicLanguageEnum languageID)
        {
            basicLanguage language = new basicLanguage();

            // All parameters come from the same data set entry; fetch it once.
            var definition = languageDataSet[languageID];

            language.affixFilePath       = definition[basicLanguageParameterEnum.affixPath];
            language.dictFilePath        = definition[basicLanguageParameterEnum.dictPath];
            language.languageNativeName  = definition[basicLanguageParameterEnum.nativeName];
            language.languageEnglishName = definition[basicLanguageParameterEnum.englishName];
            language.iso2Code            = definition[basicLanguageParameterEnum.iso2code];

            // The needles are stored as one comma-separated value; empty entries are dropped.
            char[] needleSeparators = new char[] { ',' };
            var    needles          = definition[basicLanguageParameterEnum.needles].Split(needleSeparators, StringSplitOptions.RemoveEmptyEntries);
            language.langIDNeedles.AddRange(needles);

            language.checkHuspell(true);

            return(language);
        }
        /// <summary>
        /// Runs the generic type detection for sentences, paragraphs and blocks of the tokenized content,
        /// each step only when the matching switch in <paramref name="settings"/> is on.
        /// Token type detection and the unfreezing of the content collections are currently disabled.
        /// </summary>
        /// <param name="tokenizedContent">The tokenized page to analyze; when <c>null</c> nothing is done.</param>
        /// <param name="settings">Tokenizer settings selecting the steps to run; when <c>null</c> nothing is done.</param>
        /// <param name="language">Language passed through to each analysis step.</param>
        /// <returns>Always an empty string; no report text is produced by this method.</returns>
        public static string detectGenericTypes(IContentPage tokenizedContent, nlpTokenizatorSettings settings,
                                                basicLanguage language)
        {
            if (settings == null || tokenizedContent == null)
            {
                return("");
            }

            if (settings.doSentenceDetection)
            {
                blokCategorization.sentenceAnalysis(tokenizedContent, settings, language);
            }

            if (settings.doParagraphDetection)
            {
                blokCategorization.paragraphAnalysis(tokenizedContent, settings, language);
            }

            if (settings.doBlockDetection)
            {
                blokCategorization.blockAnalysis(tokenizedContent, settings, language);
            }

            // The previous implementation returned the content of a StringBuilder that was never written to,
            // i.e. always the empty string; return it directly.
            return("");
        }
        /// <summary>
        /// Prepares the language manager: when <c>DoLoadBasicLanguageDefinitions</c> is enabled, the basic
        /// language definitions list is located in the application resources folder, loaded as a data table
        /// and every row with a known language is stored in the basic language registry.
        /// Original author's note (2013C): this must be called for the library to be referenced
        /// successfully; it should be invoked from manager.onApplicationReady().
        /// </summary>
        /// <remarks>
        /// The log is attached to the console output for the duration of the call and is detached again
        /// even when loading throws. Preparation of the other language sub-managers (Apertium, Alphabet,
        /// Hunspell, Unitex, Wordnet, named entities, semantic lexicon) is disabled in this version.
        /// </remarks>
        public static void Prepare()
        {
            aceLog.consoleControl.setAsOutput(log, "lang_mng");

            try
            {
                if (imbNLPDataConfig.settings.DoLoadBasicLanguageDefinitions)
                {
                    String    hunListPath = appManager.Application.folder_resources.findFile(imbNLPDataConfig.settings.BasicLanguageDefinitionsList, SearchOption.AllDirectories);
                    DataTable dt          = hunListPath.deserializeDataTable(imbSCI.Data.enums.reporting.dataTableExportEnum.excel);

                    // NOTE(review): the registry is written from parallel workers; confirm that
                    // basicLanguageRegistry is a collection that is safe for concurrent writes.
                    Parallel.ForEach <DataRow>(dt.Rows.ToList(), (rw) =>
                    {
                        basicLanguage bl = new basicLanguage();
                        bl.deploy(rw);

                        if (bl.languageEnum != basicLanguageEnum.unknown)
                        {
                            basicLanguageRegistry[bl.languageEnum] = bl;

                            log.log("Hunspell dictionary entry for [" + bl.languageEnum + "] found");
                        }
                        else
                        {
                            log.log("Hunspell dictionary entry failed [" + bl.languageEnum + "] found");
                        }
                    });
                }
            }
            finally
            {
                // Always detach the log from the console, also when loading the definitions fails.
                aceLog.consoleControl.removeFromOutput(log);
            }
        }
Пример #8
0
        /// <summary>
        /// Tokenizes String/PlainText content into a content page made of blocks, paragraphs, sentences and tokens,
        /// then runs the primary, secondary, general-semantics and special-semantics flagging on the page.
        /// </summary>
        /// <param name="resources">
        /// Recommended resources, resolved by type: <c>string</c> content, <see cref="basicLanguage"/> language, <c>node</c> page.
        /// The array is also passed to the flagging calls.
        /// </param>
        /// <returns>
        /// The content page. Tokenization errors are not propagated: they are written to the trace output and
        /// the page is returned in whatever state it had reached.
        /// </returns>
        public IContentPage tokenizeContent(params object[] resources)
        {
            string        content  = resources.getFirstOfType <string>();
            basicLanguage language = resources.getFirstOfType <basicLanguage>();
            node          page     = resources.getFirstOfType <node>();

            contentPage output = new contentPage();

            output.acceptSourcePage(page);

            try
            {
                // preprocess
                string source = compressNewLines(content);
                output.sourceContent = source;
                output.content       = source;

                // Blocks are separated by an empty line.
                string[] blocks = source.Split(new string[] { Environment.NewLine + Environment.NewLine },
                                               StringSplitOptions.RemoveEmptyEntries);

                if (blocks.Length == 0)
                {
                    // No block found: treat the whole source as a single block. Writing to blocks[0] of the
                    // zero-length array (as done previously) always threw IndexOutOfRangeException.
                    blocks = new string[] { source };
                }

                foreach (string bl in blocks)
                {
                    string blc = bl.StripHTML();

                    blc = SecurityElement.Escape(blc);

                    contentBlock tmpBlock = new contentBlock();
                    tmpBlock.sourceContent = blc;
                    tmpBlock.content       = blc;
                    output.items.Add(tmpBlock);
                }

                foreach (IContentBlock bl in output.items)
                {
                    // Paragraphs are the non-empty lines of a block.
                    string[] paragraphs = bl.sourceContent.Split(new string[] { Environment.NewLine },
                                                                 StringSplitOptions.RemoveEmptyEntries);
                    foreach (string par in paragraphs)
                    {
                        if (string.IsNullOrEmpty(par))
                        {
                            continue;
                        }

                        contentParagraph po = new contentParagraph(par, output);

                        // NOTE(review): two sentenceDetectionFlag values are passed as separate resources;
                        // if the callee resolves flags with getFirstOfType, only the first one may be seen - confirm.
                        po.setParagraphFromContent <contentSentence, contentSubSentence, contentToken>(output, paragraphDetectionFlag.dropSentenceWithNoToken,
                                                                                                       sentenceDetectionFlag.
                                                                                                       setSentenceToParagraph,
                                                                                                       sentenceDetectionFlag.
                                                                                                       preprocessParagraphContent,
                                                                                                       tokenDetectionFlag.standardDetection,
                                                                                                       contentPreprocessFlag.standard);

                        // Paragraphs without sentences are not registered anywhere.
                        if (po.items.Any())
                        {
                            output.paragraphs.Add(po);

                            foreach (IContentSentence sn in po.items)
                            {
                                output.sentences.Add(sn);
                                foreach (IContentToken tk in sn.items)
                                {
                                    output.tokens.Add(tk);
                                }
                            }

                            bl.setItem(po);
                        }
                    }
                }

                output.primaryFlaging(resources);

                output.secondaryFlaging(resources);

                output.generalSemanticsFlaging(resources);

                output.specialSematicsFlaging(resources);
            }
            catch (Exception ex)
            {
                // Best effort: the page built so far is still returned, but the failure is made visible
                // in the trace output instead of being dropped silently.
                var isb = new StringBuilder();
                isb.AppendLine("plainTextTokenizator error");
                isb.AppendLine("Language: " + language.toStringSafe());
                isb.AppendLine(ex.ToString());
                System.Diagnostics.Trace.TraceError(isb.ToString());
            }

            return(output);
        }
        /// <summary>
        /// Overload that currently performs no tokenization: it returns a newly created, empty content page.
        /// </summary>
        /// <param name="content">Not used by this overload.</param>
        /// <param name="language">Not used by this overload.</param>
        /// <returns>A new <c>contentPage</c> instance.</returns>
        public IContentPage tokenizeContent(string content, basicLanguage language)
        {
            return(new contentPage());
        }
Пример #10
0
        /*
         * /// <summary>
         * /// Osnovna obrada na osnovu jezika> da li je poznata rec u pitanju ili nije -
         * /// </summary>
         * /// <param name="token"></param>
         * /// <param name="language"></param>
         * private static void deployTokenLanguageBasic(IContentToken token, basicLanguage language)
         * {
         *
         *  switch (token.genericType)
         *  {
         *      case nlpTokenGenericType.unknownWord:
         *          if (language.isKnownWord(token.content))
         *          {
         *              token.genericType = nlpTokenGenericType.knownWord;
         *          }
         *          break;
         *      case nlpTokenGenericType.number:
         *          break;
         *
         *  }
         *
         * }
         */

        /*
         * /// <summary>
         * /// FAZA 3: Dodatna obrada tokena na osnovu jezickih podesavanja -- nije jos implementirano!!!
         * /// </summary>
         * /// <param name="token"></param>
         * /// <param name="language"></param>
         * private static void deployTokenLanguage(IContentToken token, basicLanguage language)
         * {
         *
         *  switch (token.genericType)
         *  {
         *      case nlpTokenGenericType.unknownWord:
         *
         *          //if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
         *          //{
         *          //    token.genericType = nlpTokenGenericType.knownWord;
         *          //}
         *
         *          // token.wordVariations = languageTools.test<List<string>>(language, token.content, languageModelOperation.getVariations) as List<string>;
         *          // List<string> stems = languageTools.test<List<string>>(language, token.content, languageModelOperation.getStems) as List<string>;
         *          //token.wordRoot = imbStringOperations.longestCommonSubstring(token.wordVariations);
         *
         *          //token.wordRoot = stems[0];
         *          break;
         *      case nlpTokenGenericType.number:
         *
         *
         *          break;
         *
         *  }
         *
         * }
         */
        /// <summary>
        /// PHASE 2: refines the generic type of a language-word token. A token that passes the spell check is
        /// flagged as a known word; otherwise acronym flags turn it into an abbreviation, and upper-case /
        /// capitalized tokens receive title or named-entity flags depending on the parent's sub-sentence flags.
        /// </summary>
        /// <param name="token">Token to inspect; its <c>flags</c> and <c>genericType</c> are updated.</param>
        /// <param name="language">Language whose spell check is used.</param>
        /// <returns>The generic type that was written back to <c>token.genericType</c>.</returns>
        private static nlpTokenGenericType findGenericTypeSecond(IContentToken token, basicLanguage language)
        {
            nlpTokenGenericType result = token.genericType;

            // NOTE(review): this is an exact equality test, not HasFlag - a token that carries any additional
            // flag next to languageWord is skipped entirely. Confirm that this is intended.
            if (token.flags == contentTokenFlag.languageWord)
            {
                if (language.testBoolean(token.content, basicLanguageCheck.spellCheck))
                {
                    token.flags = token.flags.Add(contentTokenFlag.languageKnownWord);
                    result      = nlpTokenGenericType.knownWord;
                }
                else if (token.flags.getEnumListFromFlags().ContainsOneOrMore(contentTokenFlag.acronim, contentTokenFlag.acronimDiscovered, contentTokenFlag.acronimKnown))
                {
                    result = nlpTokenGenericType.wordAbrevation;
                }
                else if (token.flags.HasFlag(contentTokenFlag.caseAllUpper))
                {
                    contentToken upperParent = token.parent as contentToken;
                    if (upperParent != null)
                    {
                        flagFromParentSubsentence(token, upperParent);
                    }
                    else
                    {
                        // All-upper token without a contentToken parent.
                        token.flags = token.flags.Add(contentTokenFlag.titleOneWord);
                    }
                }
                else if (token.flags.HasFlag(contentTokenFlag.caseFirstUpper))
                {
                    contentToken capitalParent = token.parent as contentToken;

                    // A capitalized token that is not the first one becomes a named entity
                    // when the parent's sub-sentence flags did not decide.
                    if (capitalParent != null && !flagFromParentSubsentence(token, capitalParent) && !token.isFirst)
                    {
                        token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                    }
                }
            }

            token.genericType = result;

            return(result);
        }

        /// <summary>
        /// Adds the title flag (parent is a title sub-sentence) or the named-entity flag (parent is an
        /// information sub-sentence) to the token.
        /// </summary>
        /// <returns><c>true</c> when one of the two parent flags matched and the token was flagged.</returns>
        private static bool flagFromParentSubsentence(IContentToken token, contentToken parentToken)
        {
            if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_title))
            {
                token.flags = token.flags.Add(contentTokenFlag.title);
                return(true);
            }

            if (parentToken.flags.HasFlag(contentTokenFlag.subsentence_information))
            {
                token.flags = token.flags.Add(contentTokenFlag.namedEntity);
                return(true);
            }

            return(false);
        }
Пример #11
0
 /// <summary>
 /// BASIC TOKEN ANALYSIS. The implementation is currently disabled: the body consists only of
 /// commented-out code, so calling this method has no effect. The disabled code ran PHASE 1 to PHASE 4
 /// of the token type detection over every token of the page.
 /// </summary>
 /// <param name="content">Tokenized page; not used while the body is disabled.</param>
 /// <param name="settings">Tokenizer settings; not used while the body is disabled.</param>
 /// <param name="language">Language; not used while the body is disabled.</param>
 internal static void tokenAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
 {
     /*
      * Disabled implementation, kept for reference:
      *
      * if (content.tokens == null)
      * {
      *  return ;
      * }
      *
      * // PHASE 1>
      * if (settings.doTokenTypeDetection_basic)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  tk.genericType = tokenCategorization.findGenericTypeBasic(tk);
      * }
      *
      * // PHASE 2>
      * if (settings.doTokenTypeDetection_second)
      * foreach (IContentToken tk in content.tokens)
      * {
      *
      *  // building the syllables
      * //  tk.syllablesDetection(settings);
      *
      *  tk.genericType = tokenCategorization.findGenericTypeSecond(tk, language);
      * }
      *
      *
      * // PHASE 3
      * if (settings.doTokenTypeDetection_languageBasic)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  deployTokenLanguageBasic(tk, language);
      * }
      *
      *
      * // PHASE 4>
      * if (settings.doTokenTypeDetection_languageAdvanced)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  deployTokenLanguage(tk, language);
      * }
      */
 }
Пример #12
0
 /// <summary>
 /// Initializes a new instance of the <see cref="ruleHasLanguageName"/> class.
 /// </summary>
 /// <param name="__parent">The parent evaluator, forwarded to the base constructor.</param>
 /// <param name="__language">The language stored in <c>language</c>.</param>
 public ruleHasLanguageName(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
     : base(
         "Language id/name in url/caption",
         "If language name (native or english) or iso-2-code found in url or caption",
         5,
         0,
         __parent)
 {
     language = __language;
 }
Пример #13
0
        /// <summary>
        /// Prepares the language manager: creates the Serbian and English extended languages (when not yet
        /// created), verifies their Hunspell modules, prepares the language sub-managers and finally runs a
        /// spell-check smoke test for both languages.
        /// Original author's note (2013C): this must be called for the library to be referenced
        /// successfully; it should be invoked from manager.onApplicationReady().
        /// </summary>
        /// <remarks>
        /// The log is attached to the console output for the duration of the call and is detached again
        /// even when preparation throws.
        /// </remarks>
        /// <exception cref="aceGeneralException">
        /// A Serbian Hunspell AFF or DICT file is missing, or the Hunspell check of the Serbian or English
        /// language fails.
        /// </exception>
        public static void Prepare()
        {
            aceLog.consoleControl.setAsOutput(log, "lang_mng");

            try
            {
                if (_serbian == null)
                {
                    _serbian = new extendedLanguage();

                    if (!File.Exists(PATH_hunspell_aff))
                    {
                        throw new aceGeneralException("Hunspell AFF file missing!");
                    }
                    if (!File.Exists(PATH_hunspell_dict))
                    {
                        throw new aceGeneralException("Hunspell DICT file missing!");
                    }

                    basicLanguage language = new basicLanguage();
                    language.affixFilePath       = PATH_hunspell_aff;
                    language.dictFilePath        = PATH_hunspell_dict;
                    language.languageNativeName  = "Srpski";
                    language.languageEnglishName = "Serbian";
                    language.iso2Code            = "sr";

                    _serbian.basic = language;

                    if (language.checkHuspell(true))
                    {
                        log.log("Hunspell language module: " + language.languageEnglishName + " ready");
                    }
                    else
                    {
                        // NOTE(review): the third argument is `english` although the Serbian module failed -
                        // looks like a copy of the English branch below; confirm which object is expected here.
                        aceGeneralException axe = new aceGeneralException("Serbian language Hunspell module failed", null, english, "Serbian Hunspell failed");
                        throw axe;
                    }

                    // NOTE(review): same assignment as before the Hunspell check; kept in case the
                    // `basic` setter has to run again once the check has completed - confirm.
                    _serbian.basic = language;
                    _serbian.loadAlfabet("extended\\alfabet.txt");
                }

                if (_english == null)
                {
                    _english = new extendedLanguage
                    {
                        basic = new basicLanguage("en")
                    };
                    _english.basic.affixFilePath       = PATH_hunspell_aff_en;
                    _english.basic.dictFilePath        = PATH_hunspell_dict_en;
                    _english.basic.languageNativeName  = "Engleski";
                    _english.basic.languageEnglishName = "English";
                    _english.basic.iso2Code            = "en";

                    if (_english.basic.checkHuspell(true))
                    {
                        log.log("Hunspell language module: " + _english.basic.languageEnglishName + " ready");
                    }
                    else
                    {
                        aceGeneralException axe = new aceGeneralException("English language Hunspell module failed", null, english, "English Hunspell failed");
                        throw axe;
                    }
                }

                // Sub-managers; the Dictionary, Elements, Lexicon, Morphology and Dict managers are disabled.
                languageManagerApertium.manager.prepare();

                languageManagerAlphabet.manager.prepare();

                languageManagerHunspell.manager.prepare();

                languageManagerUnitex.manager.prepare();

                languageManagerWordnet.manager.prepare();

                languageManagerDBNamedEntities.manager.prepare();

                semanticLexicon.semanticLexiconManager.manager.prepare();

                // Smoke test: spell-check one word per language and report when it is accepted.
                if (serbian.basic.hunspellEngine.Spell("Proba"))
                {
                    log.log("Hunspell language module: " + serbian.basic.languageEnglishName + " Spell check working");
                }

                if (english.basic.hunspellEngine.Spell("Test"))
                {
                    log.log("Hunspell language module: " + english.basic.languageEnglishName + " Spell check working");
                }
            }
            finally
            {
                // Always detach the log from the console, also when one of the steps above throws.
                aceLog.consoleControl.removeFromOutput(log);
            }
        }
Пример #14
0
        //public static nlpSentenceBasicType getBasicType(nlpSentenceGenericType input)
        //{
        //    String name = input.ToString();
        //    if (name.StartsWith("normal")) return nlpSentenceBasicType.normal;
        //    if (name.StartsWith("open")) return nlpSentenceBasicType.open;
        //    if (name.StartsWith("role")) return nlpSentenceBasicType.role;
        //    return nlpSentenceBasicType.unknown;
        //}

        internal static void sentenceAnalysis(IContentPage content, nlpTokenizatorSettings settings,
                                              basicLanguage language)
        {
            return;

            /*
             * foreach (IContentSentence sentence in content.sentences)
             * {
             *  if (sentence.genericType == nlpSentenceGenericType.unknown)
             *  {
             *      String spliter = sentence.spliter.Trim();
             *
             *      Boolean firstCaseOk = (sentence.items.First().letterCase == nlpTextCase.firstUpperRestLower);
             *
             *      var prevSentence = sentence.prev as IContentSentence;
             *      if (prevSentence == null) prevSentence = sentence;
             *
             *      if (prevSentence.genericType == nlpSentenceGenericType.list_startSentence)
             *      {
             *          sentence.genericType = nlpSentenceGenericType.list_item;
             *      } else
             *      {
             *          switch (spliter)
             *          {
             *              case tokenization.sentenceEnd_arrowRight:
             *              case tokenization.sentenceEnd_arrowLeft:
             *                  sentence.genericType = nlpSentenceGenericType.role_title;
             *                  break;
             *              case tokenization.sentenceEnd_notFinished2:
             *              case tokenization.sentenceEnd_notFinished:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_unfinished;
             *                  }
             *                  break;
             *              case tokenization.sentenceEnd_question:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_question;
             *                  }
             *
             *                  break;
             *              case tokenization.sentenceEnd_normal:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal;
             *                  }
             *                  break;
             *              case tokenization.sentenceEnd_listStart2:
             *              case tokenization.sentenceEnd_listStart:
             *                  sentence.genericType = nlpSentenceGenericType.list_startSentence;
             *                  break;
             *              case tokenization.sentenceEnd_listItemEnd_listEnd:
             *              case tokenization.sentenceEnd_listItemEnd:
             *                  sentence.genericType = nlpSentenceGenericType.list_item;
             *                  break;
             *              case tokenization.sentenceEnd_exclamation:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_exclamation;
             *                  }
             *                  break;
             *              default:
             *                  if (prevSentence.genericType == nlpSentenceGenericType.list_item)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.list_item;
             *                  }
             *                  else
             *                  {
             *                      if (!String.IsNullOrEmpty(spliter))
             *                      {
             *                          content.note(devNoteType.nlp,
             *                                              "Unknown spliter for sentence: [" + spliter +
             *                                              "] - add support for it in> tokenization.cs constants and sentenceAnalysis()",
             *                                              "blokCategorization");
             *                      }
             *                  }
             *                  break;
             *
             *          }
             *      }
             *
             *      /*
             *      if (sentence.genericType==nlpSentenceGenericType.unknown)
             *      {
             *          if (firstCaseOk)
             *          {
             *              sentence.genericType = nlpSentenceGenericType.normal_unknown;
             *          } else
             *          {
             *              if (sentence.items.All(x => x.letterCase == nlpTextCase.upperCase))
             *              {
             *                  sentence.genericType = nlpSentenceGenericType.role_title;
             *              } else
             *              {
             *                  sentence.genericType = nlpSentenceGenericType.role_simpleText;
             *              }
             *          }
             *      }
             *  }
             *
             * }
             *
             * }*/
        }
Пример #15
0
 /// <summary>
 /// Enumerates the blocks of <paramref name="content"/>. No per-block work is implemented here,
 /// so the page is left unchanged.
 /// </summary>
 /// <param name="content">Page whose <c>items</c> are enumerated; nothing happens when <c>items</c> is null.</param>
 /// <param name="settings">Tokenizer settings (not used by this method).</param>
 /// <param name="language">Language definition (not used by this method).</param>
 internal static void blockAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
 {
     if (content.items != null)
     {
         foreach (IContentBlock currentBlock in content.items)
         {
             // No block-level analysis is performed.
         }
     }
 }
Пример #16
0
        /// <summary>
        /// Builds the content tree for the page found in <paramref name="resources"/>, breaks the tree into blocks
        /// and turns every block into content of a new <see cref="htmlContentPage"/>.
        /// </summary>
        /// <param name="pRecordLog">Log builder that receives the two closing progress messages.</param>
        /// <param name="treeGlobalRegistry">Tree builder registry. Currently not consulted: a fresh tree builder is always created for the page.</param>
        /// <param name="doc">Web document that supplies the XPath navigator used for tree building.</param>
        /// <param name="resources">Loosely typed resources. A <see cref="basicLanguage"/> and a <see cref="node"/> are looked up by type, and the whole array is forwarded to block building and flagging.</param>
        /// <returns>The populated content page.</returns>
        /// <exception cref="ArgumentException">The page node looked up from <paramref name="resources"/> is null.</exception>
        public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, contentTreeGlobalCollection treeGlobalRegistry, webDocument doc, params object[] resources)
        {
            basicLanguage language = resources.getFirstOfType<basicLanguage>();
            node page = resources.getFirstOfType<node>();

            // The page is dereferenced unconditionally below; fail with a descriptive error
            // instead of a bare NullReferenceException.
            if (page == null)
            {
                throw new ArgumentException("A page node is required among the supplied resources.", "resources");
            }

            // <------------------------ prepare
            htmlContentPage contentPage = new htmlContentPage();
            contentPage.acceptSourcePage(page);

            string domain = page.domain;
            XPathNavigator navigator = doc.getDocumentNavigator();

            // <--------------- tree building
            // A fresh builder is always created; the former registry lookup result was never used.
            contentTreeBuilder ctb = contentTreeBuilder.getInstance(navigator, domain, page);
            contentPage.treeBuilder = ctb;

            imbTreeNodeBlockCollection blocks = ctb.tree.breakToBlocks();

            for (int bi = 0; bi < blocks.Count; bi++)
            {
                makeBlock(blocks[bi], contentPage, language, resources);
            }

            contentPage.recountItems();

            contentPage.primaryFlaging(resources);
            contentPage.secondaryFlaging(resources);

            pRecordLog.log("SKIP: complete exploration of all tokens is turned off.");
            pRecordLog.log("Basic semantic analysis done. Closing the process.");

            return contentPage;
        }
Пример #17
0
        /// <summary>
        /// Builds the content tree from <paramref name="htmlDoc"/>, breaks it into blocks, turns every block into
        /// content of a new <see cref="htmlContentPage"/> and logs how long the whole run took.
        /// </summary>
        /// <param name="pRecordLog">Log builder that receives the completion and timing messages.</param>
        /// <param name="htmlDoc">HTML document whose navigator is used for tree building.</param>
        /// <param name="language">Language passed on to block building.</param>
        /// <param name="page">Source page; supplies the domain and the url used in the timing message.</param>
        /// <returns>The populated content page.</returns>
        public htmlContentPage tokenizeContent(ILogBuilder pRecordLog, HtmlDocument htmlDoc, basicLanguage language, node page)
        {
            // Stopwatch measures elapsed time; subtracting DateTime.Now values can be skewed
            // by DST or system clock adjustments during the run.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            htmlContentPage contentPage = new htmlContentPage();
            contentPage.acceptSourcePage(page);

            string domain = page.domain;

            // Same resource bundle shape that block building and flagging expect;
            // the flag values come from members of the enclosing class.
            object[] resources = new object[] { language, page, flags, sentenceFlags, tokenFlags, preprocessFlags };

            var ctb = contentTreeBuilder.getInstance(htmlDoc.CreateNavigator(), domain, page);
            contentPage.treeBuilder = ctb;

            var blocks = ctb.tree.breakToBlocks();

            for (int bi = 0; bi < blocks.Count; bi++)
            {
                makeBlock(blocks[bi], contentPage, language, resources);
            }

            contentPage.recountItems();

            contentPage.primaryFlaging(resources);
            contentPage.secondaryFlaging(resources);

            pRecordLog.log("Basic semantic analysis done. Closing the process.");

            stopwatch.Stop();

            pRecordLog.log("Tree-building and tokenization [" + page.url + "] done in: " + stopwatch.Elapsed.TotalMilliseconds.getSeconds(4) + "s");

            return contentPage;
        }
Пример #18
0
 /// <summary>
 /// Creates the <see cref="ruleUrlHasKnownWords"/> rule and stores the language it is given.
 /// </summary>
 /// <param name="__parent">Evaluator handed to the base rule constructor.</param>
 /// <param name="__language">Language assigned to <c>language</c>.</param>
 public ruleUrlHasKnownWords(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
     : base("Url language words",
            "If url has words (3+ chars) recognized by Hunspell dictionary",
            3,
            -1,
            __parent)
 {
     language = __language;
 }
Пример #19
0
        /// <summary>
        /// Paragraph analysis step, meant to run after sentence categorization.
        /// The classification logic is disabled, so the method currently only checks whether the page has paragraphs.
        /// </summary>
        /// <param name="content">Page whose <c>paragraphs</c> collection is checked.</param>
        /// <param name="settings">Tokenizer settings (not used by this method).</param>
        /// <param name="language">Language definition (not used by this method).</param>
        internal static void paragraphAnalysis(IContentPage content, nlpTokenizatorSettings settings,
                                               basicLanguage language)
        {
            bool hasParagraphs = content.paragraphs != null;

            if (!hasParagraphs)
            {
                return;
            }

            // Disabled: the former implementation walked every paragraph and set paragraph.genericType
            // from the first sentence - from its genericType when the paragraph held a single sentence,
            // otherwise from its basicType (normal / role / list / unknown).
        }
Пример #20
0
        /// <summary>
        /// Returns the shared Serbian <see cref="basicLanguage"/>, creating it on first use.
        /// </summary>
        /// <param name="iso2code">Requested language code. Currently ignored: the Serbian definition is returned for every value.</param>
        /// <returns>The <c>basic</c> language held by the static <c>serbian</c> instance.</returns>
        /// <exception cref="aceGeneralException">The Hunspell AFF or DICT file does not exist.</exception>
        public static basicLanguage getLanguage(string iso2code)
        {
            if (serbian == null)
            {
                // Validate the dictionary files BEFORE touching the static field. Otherwise a missing
                // file would leave 'serbian' non-null with 'basic' unset, and every later call
                // would silently return null instead of reporting the problem again.
                if (!File.Exists(PATH_hunspell_aff))
                {
                    throw new aceGeneralException("Hunspell AFF file missing!");
                }
                if (!File.Exists(PATH_hunspell_dict))
                {
                    throw new aceGeneralException("Hunspell DICT file missing!");
                }

                basicLanguage language = new basicLanguage();
                language.affixFilePath       = PATH_hunspell_aff;
                language.dictFilePath        = PATH_hunspell_dict;
                language.languageNativeName  = "Srpski";
                language.languageEnglishName = "Serbian";
                language.iso2Code            = "sr";

                extendedLanguage extended = new extendedLanguage();
                extended.basic = language;
                serbian = extended;

                // The returned value is not acted upon; the call is kept for whatever
                // Hunspell preparation it performs.
                language.checkHuspell(true);
            }

            // NOTE(review): lookup by iso2code against a language collection was disabled;
            // Serbian is the only language served here.
            return serbian.basic;
        }
 /// <summary>
 /// Creates the <see cref="pageruleTitleKnownUniqueWords"/> rule and stores the language it is given.
 /// </summary>
 /// <param name="__parent">Evaluator handed to the base rule constructor.</param>
 /// <param name="__language">Language assigned to <c>language</c>.</param>
 public pageruleTitleKnownUniqueWords(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
     : base("Title known unique word",
            "If the page title contains an unique word (3+ chars long) recognized by Hunspell dictionary",
            10,
            -5,
            __parent)
 {
     language = __language;
 }
Пример #22
0
 /// <summary>
 /// Creates the <see cref="ruleKnownWordInCaption"/> rule and stores the language it is given.
 /// </summary>
 /// <param name="__parent">Evaluator handed to the base rule constructor.</param>
 /// <param name="__language">Language assigned to <c>language</c>.</param>
 public ruleKnownWordInCaption(spiderEvaluatorSimpleBase __parent, basicLanguage __language)
     : base("Caption known words",
            "If all words in link caption were recognized by Hunspell dictionary",
            4,
            -1,
            __parent)
 {
     language = __language;
 }