/// <summary>
/// Runs detection over a text holding two <c>mailto</c>-style links and expects exactly two results.
/// </summary>
public void TestMailTo()
{
    const string text = " mailTo:dale:[email protected] and then mailto://[email protected]";
    var options = UrlDetectorOptions.Default | UrlDetectorOptions.HTML;

    var detected = new UrlDetector(text, options).Detect();

    // The accessor results are discarded; the calls are only made to confirm
    // that every detected url can report its scheme and its host.
    foreach (var url in detected)
    {
        url.GetScheme();
    }

    foreach (var url in detected)
    {
        url.GetHost();
    }

    Assert.Equal(2, detected.Count);
}
/// <summary>
/// Creates a model builder: validates the arguments, then sets up the scriber,
/// the input-type processor, the url detector and an empty word list.
/// </summary>
/// <param name="templateFilename">Template file name; checked with <c>ThrowIfNullOrWhiteSpace</c>.</param>
/// <param name="languageType">Language passed to <c>CreatePosTaggerInputTypeProcessor</c>.</param>
/// <param name="urlDetectorConfig">Configuration for the url detector; checked with <c>ThrowIfNull</c>.</param>
public PosTaggerModelBuilder(
    string templateFilename,
    LanguageTypeEnum languageType,
    UrlDetectorConfig urlDetectorConfig)
{
    // Argument validation comes first so nothing is constructed from bad input.
    templateFilename.ThrowIfNullOrWhiteSpace(nameof(templateFilename));
    urlDetectorConfig.ThrowIfNull(nameof(urlDetectorConfig));

    _posTaggerScriber = PosTaggerScriber.Create4ModelBuilder(templateFilename);
    _posTaggerInputTypeProcessor = CreatePosTaggerInputTypeProcessor(languageType);
    _urlDetector = new UrlDetector(urlDetectorConfig);
    _words = new List<Word>();
}
/// <summary>
/// For every known scheme name, embeds one plain (<c>scheme://</c>) and one
/// percent-encoded (<c>scheme%3a//</c>) url in noisy text and expects both to be detected
/// when that scheme is the only valid one.
/// </summary>
public void TestUriSchemeLocators()
{
    foreach (var scheme in UriSchemeLookup.UriSchemeNames)
    {
        var plainUrl = $"{scheme}://mytestsite.com";
        var encodedUrl = $"{scheme}%3a//othersite.org";
        var text = $"did we @>> << !!://JK find #4jadsfj the url: {plainUrl} and this one too {encodedUrl} ?";

        // Restrict detection to the scheme currently under test.
        var validSchemes = new HashSet<string> { scheme };
        var detected = new UrlDetector(text, UrlDetectorOptions.HTML, validSchemes).Detect();

        // The accessor results are discarded; the calls only confirm they can be made.
        foreach (var url in detected)
        {
            url.GetScheme();
        }

        foreach (var url in detected)
        {
            url.GetHost();
        }

        Assert.Equal(2, detected.Count);
    }
}
/// <summary>
/// Builds a <see cref="Url"/> from a string that is expected to contain exactly one url.
/// </summary>
/// <param name="url">
/// Text holding the url. It is trimmed, its spaces are replaced with <c>%20</c>, and it is
/// passed through <c>UrlUtil.RemoveSpecialSpaces</c> before detection.
/// </param>
/// <returns>The single url detected in the input.</returns>
/// <exception cref="MalformedUrlException">No url, or more than one url, was detected.</exception>
public static Url Create(string url)
{
    var trimmed = url.Trim();
    var escaped = trimmed.Replace(" ", "%20");
    var candidate = UrlUtil.RemoveSpecialSpaces(escaped);

    var detector = new UrlDetector(candidate, UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN);
    var matches = detector.Detect();

    switch (matches.Count)
    {
        case 1:
            return matches[0];

        case 0:
            throw new MalformedUrlException("We couldn't find any urls in string: " + url);

        default:
            throw new MalformedUrlException("We found more than one url in string: " + url);
    }
}
/// <summary>
/// Runs url detection over <paramref name="text"/> and asserts that the original text of the
/// detected urls matches <paramref name="expected"/>, ignoring order.
/// </summary>
/// <param name="text">Input text to scan.</param>
/// <param name="options">Detection options passed to the <c>UrlDetector</c>.</param>
/// <param name="expected">The urls that must be found, in any order, duplicates included.</param>
private void RunTest(string text, UrlDetectorOptions options, params string[] expected)
{
    // Do the detection.
    var parser = new UrlDetector(text, options);
    var found = parser.Detect();
    var foundArray = found.Select(url => url.GetOriginalUrl()).ToArray();

    // Ordering is irrelevant, but multiplicity is not. A set difference (Except) would treat
    // expected [a, a] and found [a, b] as equal, so compare the two sorted sequences instead.
    var sortedExpected = expected.OrderBy(url => url, System.StringComparer.Ordinal);
    var sortedFound = foundArray.OrderBy(url => url, System.StringComparer.Ordinal);
    var areSame = sortedExpected.SequenceEqual(sortedFound);

    Assert.True(
        areSame,
        $"Expected [{string.Join(", ", expected)}] but found [{string.Join(", ", foundArray)}]");
}
/// <summary>
/// Detects urls in <paramref name="body"/> and returns the full text of those that either
/// have an IP-address host or whose host ends in one of the known TLDs.
/// </summary>
/// <param name="body">Text to scan for urls.</param>
/// <returns>The full urls that passed the filter; an empty array when nothing was detected.</returns>
public string[] GetUrls(string body)
{
    UrlDetector detector = new UrlDetector(
        body,
        UrlDetectorOptions.QUOTE_MATCH
            | UrlDetectorOptions.SINGLE_QUOTE_MATCH
            | UrlDetectorOptions.BRACKET_MATCH
            | UrlDetectorOptions.JSON
            | UrlDetectorOptions.JAVASCRIPT
            | UrlDetectorOptions.XML
            | UrlDetectorOptions.HTML
            | UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN,
        validSchemes: new HashSet<string> { "http", "https", "ftp", "ftps", "ws", "wss" });

    var discoveredUrls = detector.Detect();
    if (discoveredUrls == null || !discoveredUrls.Any())
    {
        return Array.Empty<string>();
    }

    return discoveredUrls
        .Where(x =>
        {
            if (x == null)
            {
                return false;
            }

            // Check to see if it's an IP address. If so, we can skip the TLD check.
            // Even if it doesn't parse to a C# Uri, it _may_ still be valid-enough, so run the TLD check on it.
            if (Uri.TryCreate(x.GetFullUrl(), UriKind.Absolute, out Uri? parsedUri))
            {
                if (parsedUri.HostNameType == UriHostNameType.IPv4 || parsedUri.HostNameType == UriHostNameType.IPv6)
                {
                    return true;
                }
            }

            // TLD check, to make sure we don't pick up files.
            // The host is read once per url rather than once per TLD; a missing host cannot match.
            var host = x.GetHost();
            if (string.IsNullOrEmpty(host))
            {
                return false;
            }

            // Host names are not case-sensitive and this is not linguistic text, so compare
            // ordinally and ignore case instead of relying on the current culture.
            return _tlds.Value.Any(tld => host.EndsWith($".{tld}", StringComparison.OrdinalIgnoreCase));
        })
        .Select(x => x.GetFullUrl())
        .ToArray()!;
}
/// <summary>
/// Creates a tokenizer for NER model building from the supplied configuration.
/// </summary>
/// <param name="config">
/// Source of the url-detector configuration, the model, the language type and the
/// NER input-type processor factory.
/// </param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // NOTE: this assignment writes to the configuration object supplied by the caller.
    config.UrlDetectorConfig.UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position;
    _urlDetector = new UrlDetector(config.UrlDetectorConfig);

    // Working buffers, all starting at the default capacity.
    _buildModelSentence = Sentence.CreateEmpty();
    _words = new List<Word>(DEFAULT_WORDSLIST_CAPACITY);
    _buildModelWords = new List<Buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _particleThatExclusion = config.Model.ParticleThatExclusion;

    // Character lookup maps.
    var xlat = XlatUnsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    // The POS-tagger processor is the dummy instance here; the NER processor comes from the factory.
    _posTaggerInputTypeProcessor = DummyPosTaggerInputTypeProcessor.Instance;
    _nerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Creates a tokenizer for NER model building from the supplied configuration.
/// </summary>
/// <param name="config">
/// Source of the url-detector model, the tokenizer model, the language type and the
/// NER input-type processor factory.
/// </param>
private Tokenizer(TokenizerConfig4NerModelBuilder config)
{
    // A fresh detector configuration is built from the caller's model,
    // with the extract mode set to Position.
    var urlDetectorConfig = new UrlDetectorConfig()
    {
        Model = config.UrlDetectorConfig.Model,
        UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position
    };
    _UrlDetector = new UrlDetector(urlDetectorConfig);

    // Working buffers, all starting at the default capacity.
    _BuildModelSent = sent_t.CreateEmpty();
    _Words = new List<word_t>(DEFAULT_WORDSLIST_CAPACITY);
    _BuildModelWords = new List<buildmodel_word_t>(DEFAULT_WORDSLIST_CAPACITY);

    _ParticleThatExclusion = config.Model.ParticleThatExclusion;

    // Character lookup maps; the CRF char-type map is chosen per language.
    var xlat = xlat_Unsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;
    _CCTM = UnsafeConst.GetInstanceByLanguage(config.LanguageType)._CRF_CHARTYPE_MAP;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);

    // The POS-tagger processor is the dummy instance here; the NER processor comes from the factory.
    _PosTaggerInputTypeProcessor = Dummy_PosTaggerInputTypeProcessor.Instance;
    _NerInputTypeProcessor = config.NerInputTypeProcessorFactory.CreateInstance();
}
/// <summary>
/// Creates a tokenizer with a position-mode url detector and a pre-sized word list.
/// </summary>
/// <param name="urlModel">Model handed to the url detector configuration.</param>
/// <param name="wordCapacity">
/// Requested initial capacity of the word list; the larger of this value and
/// <c>DEFAULT_WORDCAPACITY</c> is used.
/// </param>
public mld_tokenizer(UrlDetectorModel urlModel, int wordCapacity)
{
    _UrlDetector = new UrlDetector(new UrlDetectorConfig()
    {
        Model = urlModel,
        UrlExtractMode = UrlDetector.UrlExtractModeEnum.Position,
    });

    // Never start below the default capacity.
    var initialCapacity = Math.Max(DEFAULT_WORDCAPACITY, wordCapacity);
    _Words = new List<string>(initialCapacity);
    _NgramsSB = new StringBuilder();
    _AddWordToListAction = new Action<string>(AddWordToList);

    // Character lookup maps.
    var xlat = xlat_Unsafe.Inst;
    _UIM = xlat._UPPER_INVARIANT_MAP;
    _CTM = xlat._CHARTYPE_MAP;

    var unsafeConst = UnsafeConst.Inst;
    _IAW = unsafeConst._INTERPRETE_AS_WHITESPACE;
    _DWC = unsafeConst._DIGIT_WORD_CHARS;

    ReAllocWordToUpperBuffer(DEFAULT_WORDTOUPPERBUFFER);
}