Пример #1
0
 /// <summary>
 /// Enumerates the blocks of <paramref name="content"/>. The loop body is empty, so no block is
 /// inspected or changed; the method only walks <c>content.items</c> when it is not <c>null</c>.
 /// </summary>
 /// <param name="content">Page whose <c>items</c> collection is enumerated.</param>
 /// <param name="settings">Not used by the current implementation.</param>
 /// <param name="language">Not used by the current implementation.</param>
 internal static void blockAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
 {
     if (content.items != null)
     {
         foreach (IContentBlock currentBlock in content.items)
         {
             // Placeholder: no per-block analysis is implemented.
         }
     }
 }
Пример #2
0
        /// <summary>
        /// Creates the loader and fills in its configuration: the web-client settings and the text-retrieval
        /// setup are read from <c>imbWEMManager.settings</c>, the crawler flags are set to a fixed combination,
        /// and a fresh <c>nlpTokenizatorSettings</c> instance is created.
        /// </summary>
        /// <param name="__dataLoad">Value stored in <c>dataLoad</c>; defaults to <c>null</c>.</param>
        public spiderWebLoader(performanceDataLoad __dataLoad = null)
        {
            webclientSettings = imbWEMManager.settings.loaderComponent.webclientSettings;

            // Link nodes, meta nodes and content-block saving are all switched on.
            crawlerFlags = crawlerAgentFlag.detectAndProcessLinkNodes
                           | crawlerAgentFlag.detectAndProcessMetaNodes
                           | crawlerAgentFlag.runSaveContentBlock;

            // The tokenizer engine itself is not created here (that line was disabled); only its settings are.
            tokenSettings = new nlpTokenizatorSettings();

            trSetup = imbWEMManager.settings.contentProcessor.textRetrieve;

            dataLoad = __dataLoad;
        }
Пример #3
0
        /// <summary>
        /// Creates a syllable from <paramref name="__content"/>, records its parent token and classifies it.
        /// </summary>
        /// <remarks>
        /// <c>spliter</c> is set to the text matched by <c>settings.vowelLastRegex</c> in the content.
        /// <c>type</c> starts as <c>unknown</c> and is then set to:
        /// <c>numeric</c> when <c>tokenization.isNumericStart</c> matches;
        /// otherwise, when <c>tokenization.isLetterStart</c> matches, <c>regular</c> if
        /// <c>settings.syllabLengthLimit</c> is -1 or the content is not longer than that limit, else <c>irregular</c>;
        /// otherwise <c>symbol</c> for non-empty content. Empty content that matches neither pattern stays <c>unknown</c>.
        /// </remarks>
        /// <param name="__content">Text of the syllable.</param>
        /// <param name="__parent">Token stored as <c>parent</c>.</param>
        /// <param name="settings">Supplies <c>vowelLastRegex</c> and <c>syllabLengthLimit</c>.</param>
        public contentSyllable(string __content, IContentToken __parent, nlpTokenizatorSettings settings)
        {
            content = __content;
            parent = __parent;

            spliter = settings.vowelLastRegex.Match(content).Value;

            type = nlpSyllableType.unknown;

            if (tokenization.isNumericStart.IsMatch(content))
            {
                type = nlpSyllableType.numeric;
            }
            else if (tokenization.isLetterStart.IsMatch(content))
            {
                // A limit of -1 disables the length check.
                bool withinLimit = (settings.syllabLengthLimit == -1)
                                   || !(content.Length > settings.syllabLengthLimit);

                type = withinLimit ? nlpSyllableType.regular : nlpSyllableType.irregular;
            }
            else if (content.Length > 0)
            {
                type = nlpSyllableType.symbol;
            }
        }
Пример #4
0
 /// <summary>
 /// Basic token analysis. Intended to run the token-type detection phases over every token of the page
 /// (the original summary mentions phases 1 to 3; the disabled code below contains four phases).
 /// The whole implementation is commented out, so this method currently does nothing.
 /// </summary>
 /// <param name="content">Page whose tokens were to be analysed; not read by the current body.</param>
 /// <param name="settings">Tokenizer settings that selected the phases to run; not read by the current body.</param>
 /// <param name="language">Language handed to the per-token helpers in the disabled code; not read by the current body.</param>
 internal static void tokenAnalysis(IContentPage content, nlpTokenizatorSettings settings, basicLanguage language)
 {
     /*
      * Disabled implementation, kept for reference:
      *
      * if (content.tokens == null)
      * {
      *  return ;
      * }
      *
      * // PHASE 1>
      * if (settings.doTokenTypeDetection_basic)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  tk.genericType = tokenCategorization.findGenericTypeBasic(tk);
      * }
      *
      * // PHASE 2>
      * if (settings.doTokenTypeDetection_second)
      * foreach (IContentToken tk in content.tokens)
      * {
      *
      *  // building the syllables
      * //  tk.syllablesDetection(settings);
      *
      *  tk.genericType = tokenCategorization.findGenericTypeSecond(tk, language);
      * }
      *
      *
      * // PHASE 3
      * if (settings.doTokenTypeDetection_languageBasic)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  deployTokenLanguageBasic(tk, language);
      * }
      *
      *
      * // PHASE 4>
      * if (settings.doTokenTypeDetection_languageAdvanced)
      * foreach (IContentToken tk in content.tokens)
      * {
      *  deployTokenLanguage(tk, language);
      * }
      */
 }
        /// <summary>
        /// Runs the generic-type detection passes that are enabled in <paramref name="settings"/>:
        /// sentence analysis, then paragraph analysis, then block analysis.
        /// </summary>
        /// <remarks>
        /// The token-analysis pass and the unfreezing of the content collections that earlier versions
        /// performed here are disabled, so neither happens in this method.
        /// </remarks>
        /// <param name="tokenizedContent">Page to analyse. When <c>null</c> nothing is done.</param>
        /// <param name="settings">
        /// Supplies the <c>doSentenceDetection</c>, <c>doParagraphDetection</c> and <c>doBlockDetection</c>
        /// switches. When <c>null</c> nothing is done.
        /// </param>
        /// <param name="language">Passed through unchanged to each analysis pass.</param>
        /// <returns>Always an empty string; no report text is produced.</returns>
        public static string detectGenericTypes(IContentPage tokenizedContent, nlpTokenizatorSettings settings,
                                                basicLanguage language)
        {
            if (settings == null || tokenizedContent == null)
            {
                return string.Empty;
            }

            // Token analysis (settings.doTokenTypeDetection_basic) used to be invoked here; it is disabled.

            if (settings.doSentenceDetection)
            {
                blokCategorization.sentenceAnalysis(tokenizedContent, settings, language);
            }

            if (settings.doParagraphDetection)
            {
                blokCategorization.paragraphAnalysis(tokenizedContent, settings, language);
            }

            if (settings.doBlockDetection)
            {
                blokCategorization.blockAnalysis(tokenizedContent, settings, language);
            }

            // The tokens / paragraphs / sentences / items collections are no longer unfrozen here.

            // Nothing was ever appended to the former StringBuilder, so the result is the empty string.
            return string.Empty;
        }
Пример #6
0
        //public static nlpSentenceBasicType getBasicType(nlpSentenceGenericType input)
        //{
        //    String name = input.ToString();
        //    if (name.StartsWith("normal")) return nlpSentenceBasicType.normal;
        //    if (name.StartsWith("open")) return nlpSentenceBasicType.open;
        //    if (name.StartsWith("role")) return nlpSentenceBasicType.role;
        //    return nlpSentenceBasicType.unknown;
        //}

        /// <summary>
        /// Sentence categorization. Currently a no-op: the method returns immediately and none of its
        /// parameters is read. The former implementation is preserved below as a comment.
        /// </summary>
        /// <param name="content">Page whose sentences were to be categorized; not read by the current body.</param>
        /// <param name="settings">Tokenizer settings; not read by the current body.</param>
        /// <param name="language">Language; not read by the current body.</param>
        internal static void sentenceAnalysis(IContentPage content, nlpTokenizatorSettings settings,
                                              basicLanguage language)
        {
            // Analysis is switched off; everything after this statement is commented-out reference code.
            return;

            // Disabled implementation: for every sentence whose genericType was still unknown it chose a
            // generic type from the sentence's trimmed spliter, the letter case of its first item and the
            // generic type of the previous sentence, and left a dev note for unrecognized spliters.
            // The nested comment opener inside the disabled block has no effect: the comment ends at the
            // first closing delimiter, which is on the last line of the block.
            /*
             * foreach (IContentSentence sentence in content.sentences)
             * {
             *  if (sentence.genericType == nlpSentenceGenericType.unknown)
             *  {
             *      String spliter = sentence.spliter.Trim();
             *
             *      Boolean firstCaseOk = (sentence.items.First().letterCase == nlpTextCase.firstUpperRestLower);
             *
             *      var prevSentence = sentence.prev as IContentSentence;
             *      if (prevSentence == null) prevSentence = sentence;
             *
             *      if (prevSentence.genericType == nlpSentenceGenericType.list_startSentence)
             *      {
             *          sentence.genericType = nlpSentenceGenericType.list_item;
             *      } else
             *      {
             *          switch (spliter)
             *          {
             *              case tokenization.sentenceEnd_arrowRight:
             *              case tokenization.sentenceEnd_arrowLeft:
             *                  sentence.genericType = nlpSentenceGenericType.role_title;
             *                  break;
             *              case tokenization.sentenceEnd_notFinished2:
             *              case tokenization.sentenceEnd_notFinished:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_unfinished;
             *                  }
             *                  break;
             *              case tokenization.sentenceEnd_question:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_question;
             *                  }
             *
             *                  break;
             *              case tokenization.sentenceEnd_normal:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal;
             *                  }
             *                  break;
             *              case tokenization.sentenceEnd_listStart2:
             *              case tokenization.sentenceEnd_listStart:
             *                  sentence.genericType = nlpSentenceGenericType.list_startSentence;
             *                  break;
             *              case tokenization.sentenceEnd_listItemEnd_listEnd:
             *              case tokenization.sentenceEnd_listItemEnd:
             *                  sentence.genericType = nlpSentenceGenericType.list_item;
             *                  break;
             *              case tokenization.sentenceEnd_exclamation:
             *                  if (firstCaseOk)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.normal_exclamation;
             *                  }
             *                  break;
             *              default:
             *                  if (prevSentence.genericType == nlpSentenceGenericType.list_item)
             *                  {
             *                      sentence.genericType = nlpSentenceGenericType.list_item;
             *                  }
             *                  else
             *                  {
             *                      if (!String.IsNullOrEmpty(spliter))
             *                      {
             *                          content.note(devNoteType.nlp,
             *                                              "Unknown spliter for sentence: [" + spliter +
             *                                              "] - add support for it in> tokenization.cs constants and sentenceAnalysis()",
             *                                              "blokCategorization");
             *                      }
             *                  }
             *                  break;
             *
             *          }
             *      }
             *
             *      /*
             *      if (sentence.genericType==nlpSentenceGenericType.unknown)
             *      {
             *          if (firstCaseOk)
             *          {
             *              sentence.genericType = nlpSentenceGenericType.normal_unknown;
             *          } else
             *          {
             *              if (sentence.items.All(x => x.letterCase == nlpTextCase.upperCase))
             *              {
             *                  sentence.genericType = nlpSentenceGenericType.role_title;
             *              } else
             *              {
             *                  sentence.genericType = nlpSentenceGenericType.role_simpleText;
             *              }
             *          }
             *      }
             *  }
             *
             * }
             *
             * }*/
        }
Пример #7
0
        /// <summary>
        /// Paragraph analysis. At present the method only checks whether <c>content.paragraphs</c> is
        /// <c>null</c> and then returns; the categorization logic is commented out below.
        /// The original note says it must be run after sentence categorization, and that it unfreezes
        /// each paragraph's sentence collection (that call is disabled as well).
        /// </summary>
        /// <param name="content">Page whose <c>paragraphs</c> collection is checked for <c>null</c>.</param>
        /// <param name="settings">Tokenizer settings; not read by the current body.</param>
        /// <param name="language">Language; not read by the current body.</param>
        internal static void paragraphAnalysis(IContentPage content, nlpTokenizatorSettings settings,
                                               basicLanguage language)
        {
            // Nothing to analyse when the page has no paragraph collection.
            if (content.paragraphs == null)
            {
                return;
            }

            // Disabled implementation: it assigned paragraph.genericType from the first sentence of the
            // paragraph - from its genericType for single-sentence paragraphs, otherwise from its basicType.
            /*
             * foreach (IContentParagraph paragraph in content.paragraphs)
             * {
             *  //paragraph.items.unfreeze();
             *  var firstSentence = paragraph.items.First();
             *
             *  if (paragraph.items.Count == 1)
             *  {
             *      switch (firstSentence.genericType)
             *      {
             *          case nlpSentenceGenericType.normal:
             *          case nlpSentenceGenericType.normal_exclamation:
             *          case nlpSentenceGenericType.normal_question:
             *          case nlpSentenceGenericType.normal_unfinished:
             *          case nlpSentenceGenericType.normal_unknown:
             *              paragraph.genericType = nlpParagraphGenericType.textual_single;
             *              break;
             *          case nlpSentenceGenericType.role_title:
             *              paragraph.genericType = nlpParagraphGenericType.textual_title;
             *              break;
             *          case nlpSentenceGenericType.role_simpleText:
             *              paragraph.genericType = nlpParagraphGenericType.data_single;
             *              break;
             *          default:
             *              paragraph.genericType = nlpParagraphGenericType.data_single;
             *              break;
             *
             *      }
             *  } else
             *  {
             *      //var stats = paragraph.items.getRankedStats(false);
             *
             *      //nlpSentenceBasicType first = stats.First().Key.convertToBasicEnum<nlpSentenceBasicType>();
             *
             *      nlpSentenceBasicType first = firstSentence.basicType;
             *
             *      switch (first)
             *      {
             *          case nlpSentenceBasicType.normal:
             *              switch (firstSentence.genericType)
             *              {
             *                  case nlpSentenceGenericType.role_simpleText:
             *                  case nlpSentenceGenericType.normal_unknown:
             *                  case nlpSentenceGenericType.role_title:
             *                      paragraph.genericType = nlpParagraphGenericType.textual_article;
             *                      break;
             *                  default:
             *                      paragraph.genericType = nlpParagraphGenericType.textual;
             *                      break;
             *              }
             *              break;
             *          case nlpSentenceBasicType.role:
             *              paragraph.genericType = nlpParagraphGenericType.data_simple;
             *              break;
             *          default:
             *          case nlpSentenceBasicType.unknown:
             *              paragraph.genericType = nlpParagraphGenericType.unknown;
             *              break;
             *          case nlpSentenceBasicType.list:
             *              paragraph.genericType = nlpParagraphGenericType.data_listed;
             *              break;
             *      }
             *
             *
             *  }
             * }*/
        }
Пример #8
0
        /// <summary>
        /// Sub-phase 2.a: syllable detection. According to the original note it was called from phase 2,
        /// did not need to be called separately, and did not work well.
        /// Currently a no-op: the method returns immediately; the former implementation is kept below as a comment.
        /// </summary>
        /// <param name="settings">Tokenizer settings used by the disabled code; not read by the current body.</param>
        public void syllablesDetection(nlpTokenizatorSettings settings)
        {
            // Detection is switched off; everything after this statement is commented-out reference code.
            return;

            // Disabled implementation: it picked a regex by tokenBaseType, created one contentSyllable per
            // match, and added extra syllables for the text before the first match and after the last one.
            /*
             * //token.i = new List<nlpSyllable>();
             * MatchCollection coll = null;
             *
             * switch (tokenBaseType)
             * {
             *  case nlpTokenBaseType.word:
             *      if (genericType == nlpTokenGenericType.wordAbrevation)
             *      {
             *          coll = tokenization.samoRec.Matches(sourceContent);
             *      }
             *      else
             *      {
             *          if (settings.vowelRegex.IsMatch(content))
             *          {
             *              coll = settings.vowelRegex.Matches(content);
             *          }
             *      }
             *      break;
             *
             *  case nlpTokenBaseType.number:
             *      //if (genericType == nlpTokenGenericType.numberFormated)
             *      //{
             *      //    String[] npt = nlpTokenizator.numberFormatSymbols.Split(sourceContent);
             *      //    foreach (String smc in npt)
             *      //    {
             *      //        setItem(new nlpSyllable(smc, this, language));
             *      //    }
             *      //}
             *      coll = tokenization.numericSelect.Matches(sourceContent);
             *      break;
             *
             *  case nlpTokenBaseType.mixed:
             *      coll = tokenization.samoRec.Matches(sourceContent);
             *      //String[] prts = nlpTokenizator.selectLetterToOtherChanges.Split(sourceContent);
             *      //foreach (String smc in prts)
             *      //{
             *      //    setItem(new nlpSyllable(smc, this, language));
             *      //}
             *      break;
             *
             *  default:
             *      return;
             *      break;
             * }
             *
             * Int32 lastIndex = 0;
             * String start = "";
             * String ende = "";
             * contentSyllable last = null;
             * if (coll == null)
             * {
             * }
             * else
             * {
             *  foreach (Match mc in coll)
             *  {
             *      last = setItem(new contentSyllable(mc.Value, this, settings)) as contentSyllable;
             *
             *      if ((lastIndex == 0) && (mc.Index > 0))
             *      {
             *          start = content.Substring(0, mc.Index);
             *          setItem(new contentSyllable(start, this, settings));
             *      }
             *      lastIndex = mc.Index + mc.Length;
             *
             *      start = "";
             *  }
             *  if (last != null)
             *  {
             *      if (lastIndex < content.Length)
             *      {
             *          ende = content.Substring(lastIndex);
             *          setItem(new contentSyllable(ende, this, settings));
             *      }
             *  }
             *
             *  if (this.items.Count == 0)
             *  {
             *      last = setItem(new contentSyllable(content, this, settings)) as contentSyllable;
             *  }
             * }
             * //syllablesLine = rebuildSyllLine();
             * */
        }
Пример #9
0
        /// <summary>
        /// Public field of type <c>xPathQueryCache</c>. It has no initializer here (the one in the trailing
        /// comment is disabled), so it keeps its default value until it is assigned elsewhere.
        /// NOTE(review): the name suggests a cached XPath query for all nodes that contain text - confirm.
        /// </summary>
        public xPathQueryCache _xpath_allNodesWithText; // = new xPathQueryCache()

        /// <summary>
        /// Initializes a new instance of the <see cref="htmlSmartTokenizator"/> class.
        /// The constructor only forwards <paramref name="__settings"/> to the base-class constructor;
        /// it performs no initialization of its own.
        /// </summary>
        /// <param name="__settings">The tokenizer settings handed to the base class.</param>
        public htmlSmartTokenizator(nlpTokenizatorSettings __settings) : base(__settings)
        {
        }