Exemplo n.º 1
0
        /// <summary>
        /// Add a path to a url file to parse.
        /// </summary>
        /// <param name="path">Path of the url file handed to the file extractor.</param>
        /// <remarks>
        /// If no file extractor exists yet, one is created for <paramref name="path"/> with a
        /// url extractor and started; otherwise the path is added to the existing extractor.
        /// The lock taken on entry is released even when that work throws.
        /// </remarks>
        public void AddUrlFile(string path)
        {
            _lock.Take();

            // Release in a finally block so an exception while building or feeding the
            // extractor cannot leave the lock held.
            try
            {
                // has the extractor been setup?
                if (_fileExtractor == null)
                {
                    // no, create an extractor of urls
                    Extract extract = new ExtractUrl(
                        new ActionSet <string, Crawler>(ParseControl.OnUrl),
                        new ArrayRig <Protocol>(ParseControl.Protocols));

                    // start a file extractor for the files with the url extractor
                    _fileExtractor        = new FileExtractor(new ArrayRig <string>(new [] { path }), extract);
                    _fileExtractor.OnFile = new ActionSet <string>(OnCompleteUrlFile);
                    _fileExtractor.Run();
                }
                else
                {
                    // yes, add the file to the extractor
                    _fileExtractor.AddFile(path);
                }
            }
            finally
            {
                _lock.Release();
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Build the pair of parsers used for the current crawling session.
        /// </summary>
        /// <param name="crawler">Crawler passed along to the url callback of the returned url parser.</param>
        /// <returns>
        /// A pair holding the url parser and the session's page parser; the second item is
        /// <c>null</c> when the session has no page extractor.
        /// </returns>
        private Teple <ParseUrl, Parse> GetParsers(Crawler crawler)
        {
            // Create the url extractor on first use.
            if (ExtractorUrl == null)
            {
                ExtractorUrl = new ExtractUrl(null, new ArrayRig <Protocol>(Protocols));
            }

            // Get a url parser from the extractor and point its callback at OnUrl for this crawler.
            ParseUrl urlParser = (ParseUrl)ExtractorUrl.GetParser();
            urlParser.OnUrl = new ActionSet <string, Crawler>(OnUrl, null, crawler);

            // The page parser is optional: only available when the session has a page extractor.
            Parse pageParser;
            if (_session.PageExtractor == null)
            {
                pageParser = null;
            }
            else
            {
                pageParser = _session.PageExtractor.GetParser();
            }

            return new Teple <ParseUrl, Parse>(urlParser, pageParser);
        }
Exemplo n.º 3
0
 /// <summary>
 /// Passes <c>ExtractUrl</c> through <c>TrimAndPruneSpaces</c> and then delegates to the base save.
 /// </summary>
 /// <param name="ignoreCache">Forwarded unchanged to the base implementation.</param>
 /// <param name="permission">Forwarded unchanged to the base implementation.</param>
 /// <returns>The result of the base <c>SaveChanges</c> call.</returns>
 public override IDocEntity SaveChanges(bool ignoreCache, DocConstantPermission permission)
 {
     // A null value stays null; the property is assigned either way.
     var cleanedUrl = ExtractUrl?.TrimAndPruneSpaces();
     ExtractUrl = cleanedUrl;

     return base.SaveChanges(ignoreCache, permission);
 }