Example #1
0
        /// <summary>
        /// Runs detection (default options combined with HTML) over a text that contains two
        /// mailto-style entries and checks that exactly two urls are reported.
        /// </summary>
        public void TestMailTo()
        {
            var text     = $" mailTo:dale:[email protected] and then mailto://[email protected]";
            var options  = UrlDetectorOptions.Default | UrlDetectorOptions.HTML;
            var detected = new UrlDetector(text, options).Detect();

            // Call the scheme getter on every result first, then the host getter,
            // as two separate passes over the results.
            foreach (var url in detected)
            {
                url.GetScheme();
            }

            foreach (var url in detected)
            {
                url.GetHost();
            }

            Assert.Equal(2, detected.Count);
        }
Example #2
0
        /// <summary>
        /// Creates a model builder: validates the arguments, then sets up the scriber, the
        /// input-type processor for <paramref name="languageType"/>, the url detector and an
        /// empty word list.
        /// </summary>
        /// <param name="templateFilename">Template file name; checked with <c>ThrowIfNullOrWhiteSpace</c>.</param>
        /// <param name="languageType">Language passed to <c>CreatePosTaggerInputTypeProcessor</c>.</param>
        /// <param name="urlDetectorConfig">Configuration handed to the url detector; checked with <c>ThrowIfNull</c>.</param>
        public PosTaggerModelBuilder(
            string            templateFilename,
            LanguageTypeEnum  languageType,
            UrlDetectorConfig urlDetectorConfig)
        {
            // Argument checks run before any field is assigned.
            templateFilename.ThrowIfNullOrWhiteSpace(nameof(templateFilename));
            urlDetectorConfig.ThrowIfNull(nameof(urlDetectorConfig));

            _posTaggerScriber            = PosTaggerScriber.Create4ModelBuilder(templateFilename);
            _posTaggerInputTypeProcessor = CreatePosTaggerInputTypeProcessor(languageType);
            _urlDetector                 = new UrlDetector(urlDetectorConfig);
            _words                       = new List<Word>();
        }
Example #3
0
 /// <summary>
 /// For every name in <c>UriSchemeLookup.UriSchemeNames</c>, builds a text holding one url
 /// written with "://" and one written with "%3a//", runs HTML-mode detection restricted
 /// to that single scheme, and checks that exactly two urls are found.
 /// </summary>
 public void TestUriSchemeLocators()
 {
     foreach (var scheme in UriSchemeLookup.UriSchemeNames)
     {
         var plainUrl   = $"{scheme}://mytestsite.com";
         var encodedUrl = $"{scheme}%3a//othersite.org";
         var text       = $"did we @>> << !!://JK find #4jadsfj the url: {plainUrl} and this one too {encodedUrl} ?";

         // Only the scheme under test is passed to the detector.
         var allowedSchemes = new HashSet<string> { scheme };
         var detected       = new UrlDetector(text, UrlDetectorOptions.HTML, allowedSchemes).Detect();

         // Call the scheme getter on every result, then the host getter, as separate passes.
         foreach (var url in detected)
         {
             url.GetScheme();
         }

         foreach (var url in detected)
         {
             url.GetHost();
         }

         Assert.Equal(2, detected.Count);
     }
 }
Example #4
0
        /// <summary>
        /// Builds a <see cref="Url"/> from a string that is expected to contain exactly one url.
        /// </summary>
        /// <remarks>
        /// The input is trimmed, remaining plain spaces are replaced with "%20", and the result is
        /// passed through <c>UrlUtil.RemoveSpecialSpaces</c> before detection runs with
        /// <c>UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN</c>.
        /// </remarks>
        /// <param name="url">The text to parse; must not be null.</param>
        /// <returns>The single url detected in <paramref name="url"/>.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
        /// <exception cref="MalformedUrlException">No url, or more than one url, was detected.</exception>
        public static Url Create(string url)
        {
            // Fail with a descriptive exception instead of a NullReferenceException from url.Trim().
            if (url == null)
            {
                throw new System.ArgumentNullException(nameof(url));
            }

            var formattedString = UrlUtil.RemoveSpecialSpaces(url.Trim().Replace(" ", "%20"));
            var urls            = new UrlDetector(formattedString, UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN).Detect();

            // The messages quote the caller's original input, not the formatted string.
            switch (urls.Count)
            {
                case 1:
                    return urls[0];

                case 0:
                    throw new MalformedUrlException("We couldn't find any urls in string: " + url);

                default:
                    throw new MalformedUrlException("We found more than one url in string: " + url);
            }
        }
Example #5
0
        /// <summary>
        /// Runs url detection over <paramref name="text"/> and asserts that the original-url
        /// strings of the results match <paramref name="expected"/>, ignoring order.
        /// </summary>
        /// <remarks>
        /// The comparison is a multiset comparison: every expected string must be found the same
        /// number of times it is listed. A set difference plus a length check is not enough,
        /// because e.g. expected [a, a, b] and found [a, b, b] would be treated as equal.
        /// </remarks>
        /// <param name="text">Input text handed to the detector.</param>
        /// <param name="options">Detector options to use.</param>
        /// <param name="expected">The original-url strings that must be detected.</param>
        private void RunTest(string text, UrlDetectorOptions options, params string[] expected)
        {
            // Do the detection.
            var found      = new UrlDetector(text, options).Detect();
            var foundArray = found.Select(u => u.GetOriginalUrl()).ToArray();

            // Ordering is irrelevant, so sort both sides with the same ordinal comparer and
            // compare element by element; this also catches differing duplicate counts.
            var comparer = System.StringComparer.Ordinal;
            var areSame  = expected.OrderBy(s => s, comparer)
                                   .SequenceEqual(foundArray.OrderBy(s => s, comparer), comparer);

            Assert.True(
                areSame,
                "Expected [" + string.Join(", ", expected) + "] but found [" + string.Join(", ", foundArray) + "]");
        }
        /// <summary>
        /// Extracts urls from <paramref name="body"/> and returns their full-url strings.
        /// </summary>
        /// <remarks>
        /// Detection is limited to the http, https, ftp, ftps, ws and wss schemes. A detected url is
        /// kept when its full url parses as an absolute <see cref="Uri"/> with an IPv4 or IPv6 host,
        /// or when its host ends with "." followed by one of the entries in <c>_tlds</c>.
        /// NOTE(review): <c>_tlds</c> is declared outside this block; presumably a lazily created
        /// collection of TLD strings - confirm its casing matches what <c>GetHost()</c> returns,
        /// because the suffix check is case-sensitive.
        /// </remarks>
        /// <param name="body">Text to scan for urls.</param>
        /// <returns>The full urls that passed the filter; an empty array when nothing was detected.</returns>
        public string[] GetUrls(string body)
        {
            var detector = new UrlDetector(
                body,
                UrlDetectorOptions.QUOTE_MATCH |
                UrlDetectorOptions.SINGLE_QUOTE_MATCH |
                UrlDetectorOptions.BRACKET_MATCH |
                UrlDetectorOptions.JSON |
                UrlDetectorOptions.JAVASCRIPT |
                UrlDetectorOptions.XML |
                UrlDetectorOptions.HTML |
                UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN,
                validSchemes: new HashSet<string> { "http", "https", "ftp", "ftps", "ws", "wss" });

            var discoveredUrls = detector.Detect();

            if (discoveredUrls == null || !discoveredUrls.Any())
            {
                return Array.Empty<string>();
            }

            return discoveredUrls
                   .Where(x =>
                   {
                       if (x == null)
                       {
                           return false;
                       }

                       // Check to see if it's an IP address. If so, we can skip the TLD check.
                       // Even if it doesn't parse to a C# Uri, it _may_ still be valid-enough, so run the TLD check on it.
                       if (Uri.TryCreate(x.GetFullUrl(), UriKind.Absolute, out Uri? parsedUri)
                           && (parsedUri.HostNameType == UriHostNameType.IPv4 || parsedUri.HostNameType == UriHostNameType.IPv6))
                       {
                           return true;
                       }

                       // Read the host once instead of once per TLD entry.
                       var host = x.GetHost();
                       if (string.IsNullOrEmpty(host))
                       {
                           return false;
                       }

                       // TLD check, to make sure we don't pick up files.
                       // Ordinal comparison: host names are not linguistic text, so the
                       // culture-sensitive default of EndsWith(string) must not be used here.
                       return _tlds.Value.Any(tld => host.EndsWith($".{tld}", StringComparison.Ordinal));
                   })
                   .Select(x => x.GetFullUrl())
                   .ToArray()!;
        }
Example #7
0
        /// <summary>
        /// Initializes a tokenizer from a NER model-builder configuration: the url detector, the
        /// empty build-model sentence, the word buffers, the character maps and the input-type
        /// processors.
        /// </summary>
        /// <param name="config">Configuration supplying the url-detector config, model, language type and NER processor factory.</param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // NOTE(review): this assigns UrlExtractMode on the config object reachable from the
            // caller's argument before using it - confirm callers do not reuse that config
            // expecting a different mode.
            config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
            _urlDetector = new UrlDetector(config.UrlDetectorConfig);

            _buildModelSentence    = Sentence.CreateEmpty();
            _words                 = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
            _buildModelWords       = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _particleThatExclusion = config.Model.ParticleThatExclusion;

            // Character maps: two from XlatUnsafe, one from the per-language constants.
            _UIM = XlatUnsafe.Inst._UPPER_INVARIANT_MAP;
            _CTM = XlatUnsafe.Inst._CHARTYPE_MAP;
            var languageConstants = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConstants._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // The POS-tagger slot gets the dummy processor; the NER one comes from the factory.
            _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
            var nerProcessorFactory = config.NerInputTypeProcessorFactory;
            _nerInputTypeProcessor  = nerProcessorFactory.CreateInstance();
        }
Example #8
0
        /// <summary>
        /// Initializes a tokenizer from a NER model-builder configuration: the url detector, the
        /// empty build-model sentence, the word buffers, the character maps and the input-type
        /// processors.
        /// </summary>
        /// <param name="config">Configuration supplying the url-detector model, model settings, language type and NER processor factory.</param>
        private Tokenizer(TokenizerConfig4NerModelBuilder config)
        {
            // A new detector config is built from the supplied model, with the extract mode
            // set to Position.
            var urlDetectorConfig = new UrlDetectorConfig()
            {
                Model          = config.UrlDetectorConfig.Model,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
            };
            _UrlDetector = new UrlDetector(urlDetectorConfig);

            _BuildModelSent        = sent_t.CreateEmpty();
            _Words                 = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _BuildModelWords       = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);
            _ParticleThatExclusion = config.Model.ParticleThatExclusion;

            // Character maps: two from xlat_Unsafe, one from the per-language constants.
            _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
            _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
            var languageConstants = UnsafeConst.GetInstanceByLanguage(config.LanguageType);
            _CCTM = languageConstants._CRF_CHARTYPE_MAP;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

            // The POS-tagger slot gets the dummy processor; the NER one comes from the factory.
            _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
            var nerProcessorFactory = config.NerInputTypeProcessorFactory;
            _NerInputTypeProcessor  = nerProcessorFactory.CreateInstance();
        }
Example #9
0
        /// <summary>
        /// Initializes the tokenizer: a url detector over <paramref name="urlModel"/> in Position
        /// extract mode, the word list, the n-gram string builder, the add-word delegate and the
        /// character lookup tables.
        /// </summary>
        /// <param name="urlModel">Model assigned to the url detector configuration.</param>
        /// <param name="wordCapacity">Requested initial capacity of the word list; <c>DEFAULT_WORDCAPACITY</c> is used when it is larger.</param>
        public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
        {
            _UrlDetector = new UrlDetector(new UrlDetectorConfig()
            {
                Model          = urlModel,
                UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
            });

            // Never start below the default capacity.
            var initialCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
            _Words               = new List<string>(initialCapacity);
            _NgramsSB            = new StringBuilder();
            _AddWordToListAction = new Action<string>(AddWordToList);

            // Lookup tables taken from the shared xlat_Unsafe / UnsafeConst instances.
            _UIM = xlat_Unsafe.Inst._UPPER_INVARIANT_MAP;
            _CTM = xlat_Unsafe.Inst._CHARTYPE_MAP;
            _IAW = UnsafeConst.Inst._INTERPRETE_AS_WHITESPACE;
            _DWC = UnsafeConst.Inst._DIGIT_WORD_CHARS;

            ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
        }