/// <summary>
/// Add a path to a url file to parse.
/// </summary>
/// <param name="path">Path of the url file to hand to the file extractor.</param>
/// <remarks>
/// The first call creates the file extractor (backed by a url extractor),
/// registers <c>OnCompleteUrlFile</c> as its per-file callback and runs it.
/// Later calls only add the file to the existing extractor. The lock is
/// released even when any of those steps throws.
/// </remarks>
public void AddUrlFile(string path) {
    _lock.Take();
    try {
        // has the extractor been setup?
        if (_fileExtractor == null) {
            // no, create an extractor of urls
            Extract extract = new ExtractUrl(
                new ActionSet<string, Crawler>(ParseControl.OnUrl),
                new ArrayRig<Protocol>(ParseControl.Protocols));

            // start a file extractor for the files with the url extractor
            _fileExtractor = new FileExtractor(new ArrayRig<string>(new[] { path }), extract);
            _fileExtractor.OnFile = new ActionSet<string>(OnCompleteUrlFile);
            _fileExtractor.Run();
        } else {
            // yes, add the file to the extractor
            _fileExtractor.AddFile(path);
        }
    } finally {
        // always hand the lock back; an exception above must not leave it taken
        _lock.Release();
    }
}
/// <summary>
/// Build the pair of parsers used for the current crawling session.
/// </summary>
/// <param name="crawler">Crawler bound into the url parser's <c>OnUrl</c> callback.</param>
/// <returns>
/// A pair holding the url parser and the session's page parser; the page
/// parser is <c>null</c> when the session has no page extractor.
/// </returns>
private Teple<ParseUrl, Parse> GetParsers(Crawler crawler) {
    // lazily create the shared url extractor on first use
    if (ExtractorUrl == null) {
        ExtractorUrl = new ExtractUrl(null, new ArrayRig<Protocol>(Protocols));
    }

    // obtain a url parser and bind its callback to this crawler
    // NOTE(review): the null second argument presumably leaves the url slot
    // to be supplied at invocation time - confirm against ActionSet.
    ParseUrl urlParser = (ParseUrl)ExtractorUrl.GetParser();
    urlParser.OnUrl = new ActionSet<string, Crawler>(OnUrl, null, crawler);

    // the page parser only exists when the session defines a page extractor
    Parse pageParser = null;
    if (_session.PageExtractor != null) {
        pageParser = _session.PageExtractor.GetParser();
    }

    Teple<ParseUrl, Parse> parsers = new Teple<ParseUrl, Parse>(urlParser, pageParser);
    return parsers;
}
/// <summary>
/// Normalises <c>ExtractUrl</c> via <c>TrimAndPruneSpaces</c> and then
/// delegates to the base implementation to save the entity.
/// </summary>
/// <param name="ignoreCache">Forwarded unchanged to the base implementation.</param>
/// <param name="permission">Forwarded unchanged to the base implementation.</param>
/// <returns>Whatever the base <c>SaveChanges</c> returns.</returns>
public override IDocEntity SaveChanges(bool ignoreCache, DocConstantPermission permission)
{
    // a null value stays null; the property is reassigned either way
    var normalisedUrl = ExtractUrl?.TrimAndPruneSpaces();
    ExtractUrl = normalisedUrl;

    IDocEntity saved = base.SaveChanges(ignoreCache, permission);
    return saved;
}