/// <summary>
/// Creates a harvester and derives its home domain and home URL from the start URL.
/// </summary>
/// <param name="startURL">
/// URL the harvest starts from. It is matched against <c>SelectDomainName</c> and <c>SelectPath</c>;
/// capture group 1 of each match is stored in <c>HomeDomain</c> and <c>HomeURL</c> respectively.
/// </param>
public OpenDictionaryHarvester(String startURL)
{
    // No success check is made on either match: group 1 is read unconditionally.
    var domainMatch = SelectDomainName.Match(startURL);
    HomeDomain = domainMatch.Groups[1].Value;

    var pathMatch = SelectPath.Match(startURL);
    HomeURL = pathMatch.Groups[1].Value;
}
        /// <summary>
        /// Loads the files found in a directory into the given category as web site documents, grouped by domain.
        /// </summary>
        /// <remarks>
        /// Each file name is converted with <c>GetURLPathFromFilename</c>. Results that start with "http" and match
        /// <c>SelectDomainName</c> are grouped by capture group 1 of that match; all other files are ignored.
        /// For every group one <c>WebSiteDocuments</c> instance is added to <c>category.siteDocuments</c>. Within a
        /// group, a document whose assigned ID has already been seen is not added again.
        /// Nothing is loaded when the directory contains fewer than two files.
        /// </remarks>
        /// <param name="category">Category whose <c>siteDocuments</c> collection receives the loaded web sites.</param>
        /// <param name="di">Directory whose files are scanned.</param>
        /// <param name="options">Format options forwarded unchanged to <c>LoadWebSiteDocument</c>.</param>
        /// <param name="logger">Optional logger for progress messages; no logging is done when <c>null</c>.</param>
        private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
        {
            FileInfo[] fileList = di.GetFiles();

            // NOTE(review): a directory holding exactly one file is skipped too - confirm this threshold is intended.
            if (fileList.Length <= 1)
            {
                return;
            }

            // Pass 1: index the files by the domain extracted from their file name.
            Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<String, List<FileInfo>>();

            foreach (FileInfo fi in fileList)
            {
                String path = GetURLPathFromFilename(fi.Name);

                // Ordinal comparison: this is a protocol prefix check, not linguistic text.
                if (!path.StartsWith("http", StringComparison.Ordinal))
                {
                    continue;
                }

                Match m = SelectDomainName.Match(path);
                if (!m.Success)
                {
                    continue;
                }

                String domain = m.Groups[1].Value;

                // Single lookup instead of ContainsKey followed by the indexer.
                List<FileInfo> siteFiles;
                if (!siteFilesIndex.TryGetValue(domain, out siteFiles))
                {
                    siteFiles = new List<FileInfo>();
                    siteFilesIndex.Add(domain, siteFiles);
                }

                siteFiles.Add(fi);
            }

            if (logger != null)
            {
                logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
            }

            // Pass 2: build one web site per domain and load its documents.
            foreach (KeyValuePair<String, List<FileInfo>> site in siteFilesIndex)
            {
                WebSiteDocuments webSite = new WebSiteDocuments(site.Key);

                // IDs already assigned within this site; a set keeps the duplicate check O(1) per document.
                HashSet<String> assignedIds = new HashSet<String>();

                foreach (FileInfo fi in site.Value)
                {
                    WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);

                    String pageUrl = WebSiteDocumentsSetTools.GetPageURL(d, webSite);
                    d.AssignedID = pageUrl;

                    // Add returns false for an ID that is already present, so only the first
                    // document with a given ID is kept.
                    if (assignedIds.Add(d.AssignedID))
                    {
                        webSite.documents.Add(d);
                    }
                }

                category.siteDocuments.Add(webSite);

                if (logger != null)
                {
                    logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
                }
            }
        }