/// <summary>
/// Loads the category structure stored under a directory: the site list found directly in
/// <paramref name="path"/> is loaded into this category, and every site list found in a
/// subdirectory is loaded into the category resolved from that subdirectory's relative path.
/// </summary>
/// <param name="path">Root directory of the stored category structure.</param>
/// <param name="options">Format options, forwarded to <c>LoadDomainList</c>.</param>
/// <param name="logger">Optional logger, forwarded to <c>LoadDomainList</c>.</param>
public void Load(String path, WebDomainCategoryFormatOptions options = WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate | WebDomainCategoryFormatOptions.normalizeDomainname, ILogBuilder logger = null)
{
    DirectoryInfo di = new DirectoryInfo(path);

    FileInfo rootList = di.GetFiles(categorySiteList, SearchOption.TopDirectoryOnly).FirstOrDefault();
    if (rootList != null)
    {
        // The logger is forwarded here as well, so the root list is handled like the nested ones.
        LoadDomainList(rootList.FullName, options, logger);
    }

    foreach (FileInfo fi in di.GetFiles(categorySiteList, SearchOption.AllDirectories))
    {
        // The recursive search also returns the root list, which was already loaded above.
        if (rootList != null && String.Equals(fi.FullName, rootList.FullName, StringComparison.Ordinal))
        {
            continue;
        }

        // Path of the containing directory, relative to the root directory.
        String pathForCategory = fi.DirectoryName.removeStartsWith(di.FullName);

        WebDomainCategory cat = graphTools.ConvertPathToGraph<WebDomainCategory>(this, pathForCategory, false, Path.DirectorySeparatorChar.ToString());
        cat.LoadDomainList(fi.FullName, options, logger);
    }
}
/// <summary>
/// Builds a <see cref="WebDocumentsCategory"/> named after the directory at <paramref name="path"/>
/// and fills it from the directory's subdirectories.
/// </summary>
/// <param name="path">Directory holding the dataset; must not be null or empty.</param>
/// <param name="options">Format options, forwarded to <c>LoadDirectory</c>.</param>
/// <param name="logger">Optional logger; receives a message when the directory does not exist.</param>
/// <returns>The loaded category; it has only its name set when the directory does not exist.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="path"/> is null or empty.</exception>
public WebDocumentsCategory LoadDataset(String path, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    if (path.isNullOrEmpty())
    {
        throw new ArgumentException("Path is empty or null", nameof(path));
    }

    WebDocumentsCategory dataset = new WebDocumentsCategory();
    DirectoryInfo datasetDirectory = new DirectoryInfo(path);
    dataset.name = datasetDirectory.Name;

    if (datasetDirectory.Exists)
    {
        LoadDirectory(dataset, datasetDirectory, options, logger);
    }
    else if (logger != null)
    {
        logger.log("Directory " + path + " not found!");
    }

    return dataset;
}
/// <summary>
/// Recursively mirrors the subdirectories of <paramref name="di"/> as child categories of
/// <paramref name="category"/>, loading the web sites found in each subdirectory.
/// </summary>
/// <param name="category">Category that receives one child per subdirectory.</param>
/// <param name="di">Directory whose subdirectories are scanned.</param>
/// <param name="options">Format options, forwarded to <c>LoadWebSites</c>.</param>
/// <param name="logger">Optional logger, forwarded to <c>LoadWebSites</c>.</param>
private void LoadDirectory(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    foreach (DirectoryInfo subDirectory in di.GetDirectories())
    {
        // The child category is named after the subdirectory.
        var subCategory = category.CreateChildItem(subDirectory.Name) as WebDocumentsCategory;

        // Files of the subdirectory first, then its own subdirectories.
        LoadWebSites(subCategory, subDirectory, options, logger);
        LoadDirectory(subCategory, subDirectory, options, logger);
    }
}
/// <summary>
/// Saves this category into a directory: writes its domain list, saves every child category
/// into a sub-folder named after the child, and optionally writes the aggregate site list.
/// For the root category it can additionally write a DGML graph and readme files.
/// </summary>
/// <param name="pathToSave">Directory that receives the files of this category.</param>
/// <param name="options">Format options selecting which optional files are written.</param>
/// <param name="logger">Optional logger, forwarded to <c>GetDomainList</c> and to the child categories.</param>
public void Save(String pathToSave, WebDomainCategoryFormatOptions options = WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate | WebDomainCategoryFormatOptions.normalizeDomainname, ILogBuilder logger = null)
{
    DirectoryInfo di = new DirectoryInfo(pathToSave);
    folderNode folder = di;

    String domainList = GetDomainList(options, logger);
    String path = folder.pathFor(categorySiteList, imbSCI.Data.enums.getWritableFileMode.overwrite, "Web sites at this category", true);
    File.WriteAllText(path, domainList);

    foreach (WebDomainCategory category in this)
    {
        var subFolder = folder.Add(category.name, category.name, "Subcategory");
        category.Save(subFolder.path, options, logger);
    }

    if (options.HasFlag(WebDomainCategoryFormatOptions.saveAggregate))
    {
        // GetSites(100) is called on this category and does not depend on the child being
        // visited, so it is collected once here. Calling it inside the child loop appended the
        // same lines once per subcategory and left the aggregate of a leaf category empty.
        // NOTE(review): assumes GetSites(100) covers this category and its subcategories - confirm.
        StringBuilder sb = new StringBuilder();
        foreach (string site in GetSites(100))
        {
            sb.AppendLine(site);
        }

        path = folder.pathFor(categoryAggregateSiteList, imbSCI.Data.enums.getWritableFileMode.overwrite, "Web sites at this category, including subcategories", true);
        File.WriteAllText(path, sb.ToString());
    }

    // Graph export and readme generation are performed for the root category only.
    if (this.root == this)
    {
        if (options.HasFlag(WebDomainCategoryFormatOptions.saveGraphAtRoot))
        {
            var dgml = GraphConverters.DataSetDomainGraphConverter.Convert(this, 300);
            dgml.Save(folder.pathFor("dataset.dgml", imbSCI.Data.enums.getWritableFileMode.overwrite, "DirectedGraphMarkupLanguage representation of categories", true), imbSCI.Data.enums.getWritableFileMode.overwrite);
        }

        if (options.HasFlag(WebDomainCategoryFormatOptions.saveReadmeFile))
        {
            folder.generateReadmeFiles(imbACE.Core.appManager.AppInfo);
        }
    }
}
/// <summary>
/// Replaces the sites of this category with the entries read from a text file, one entry per line.
/// Nothing is changed when the file does not exist.
/// </summary>
/// <param name="path">Path of the domain list file.</param>
/// <param name="options">
/// Format options; with <c>normalizeDomainname</c> set, each entry is replaced by the
/// <c>urlProper</c> value of a <c>domainAnalysis</c> built from it.
/// </param>
/// <param name="logger">Optional logger; receives a message when the file does not exist.</param>
public void LoadDomainList(String path, WebDomainCategoryFormatOptions options = WebDomainCategoryFormatOptions.saveReadmeFile | WebDomainCategoryFormatOptions.saveAggregate | WebDomainCategoryFormatOptions.normalizeDomainname, ILogBuilder logger = null)
{
    if (!File.Exists(path))
    {
        if (logger != null)
        {
            logger.log("Domain list file " + path + " not found!");
        }
        return;
    }

    bool normalize = options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname);

    sites.Clear();

    foreach (String ln in File.ReadAllLines(path))
    {
        // Blank lines carry no domain; skipping them avoids storing empty site entries.
        if (String.IsNullOrWhiteSpace(ln))
        {
            continue;
        }

        String s = ln;
        if (normalize)
        {
            domainAnalysis da = new domainAnalysis(s);
            s = da.urlProper;
        }

        sites.Add(s);
    }
}
/// <summary>
/// Renders the sites of this category as text, one site per line.
/// </summary>
/// <param name="options">
/// Format options; with <c>normalizeDomainname</c> set, each site is written as the
/// <c>urlProper</c> value of a <c>domainAnalysis</c> built from it.
/// </param>
/// <param name="logger">Optional logger; not used by this method.</param>
/// <returns>The site list, every entry terminated by a line break.</returns>
public String GetDomainList(WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    bool normalize = options.HasFlag(WebDomainCategoryFormatOptions.normalizeDomainname);
    StringBuilder listing = new StringBuilder();

    foreach (String site in sites)
    {
        String entry = normalize ? new domainAnalysis(site).urlProper : site;
        listing.AppendLine(entry);
    }

    return listing.ToString();
}
/// <summary>
/// Saves a dataset into a directory: writes its web sites and, depending on
/// <paramref name="options"/>, graph exports of the category structure and readme files.
/// </summary>
/// <param name="dataset">Root category of the dataset to save.</param>
/// <param name="path">Directory that receives the dataset.</param>
/// <param name="options">Format options selecting which optional files are written.</param>
/// <param name="logger">Optional logger; not used by this method.</param>
public void SaveDataset(WebDocumentsCategory dataset, String path, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    folderNode targetFolder = new DirectoryInfo(path);

    // Folder description: the dataset's description combined with this instance's description.
    targetFolder.description = dataset.description.add(description, Environment.NewLine);

    SaveWebSites(dataset, targetFolder, options);

    bool exportGraph = options.HasFlag(WebDomainCategoryFormatOptions.saveGraphAtRoot);
    if (exportGraph)
    {
        var categoryGraph = GraphConverters.documentsConverter.Convert(dataset, 300);
        String graphPath = targetFolder.pathFor("dataset", imbSCI.Data.enums.getWritableFileMode.overwrite, "Directed graph of categories in the dataset", true);
        categoryGraph.Save(graphPath);

        var dotGraph = imbSCI.Graph.Converters.GraphConversionTools.ConvertToDOT(categoryGraph);
        String dotPath = targetFolder.pathFor("dataset_dot", imbSCI.Data.enums.getWritableFileMode.existing, "DOT graph of categories in the dataset", true);
        dotGraph.Save(dotPath);
    }

    bool writeReadme = options.HasFlag(WebDomainCategoryFormatOptions.saveReadmeFile);
    if (writeReadme)
    {
        targetFolder.generateReadmeFiles(imbACE.Core.appManager.AppInfo);
    }
}
/// <summary>
/// Creates the <see cref="WebSiteDocument"/> for a stored page file. The document path is the
/// part of the URL, decoded from the file name, that follows the web site's domain.
/// </summary>
/// <param name="fi">File holding the page.</param>
/// <param name="webSite">Web site the page belongs to; its <c>domain</c> is cut from the URL.</param>
/// <param name="options">Format options; <c>lazyLoading</c> is passed to the document constructor.</param>
/// <returns>The document created for the file.</returns>
private WebSiteDocument LoadWebSiteDocument(FileInfo fi, WebSiteDocuments webSite, WebDomainCategoryFormatOptions options)
{
    String path = GetURLPathFromFilename(fi.Name);

    // Ordinal search: a domain is an identifier, not linguistic text.
    int domainStart = path.IndexOf(webSite.domain, StringComparison.Ordinal);
    if (domainStart >= 0)
    {
        path = path.Substring(domainStart + webSite.domain.Length);
    }
    // When the domain is not part of the URL the path is kept as decoded; with an index of -1
    // the previous unconditional Substring cut the URL at an arbitrary position.

    return new WebSiteDocument(path, options.HasFlag(WebDomainCategoryFormatOptions.lazyLoading), fi.FullName);
}
/// <summary>
/// Saves a category into a sub-folder of <paramref name="rootFolder"/> named after the category:
/// every web site of the category is saved there, the domain list is written when requested,
/// and the subcategories are saved recursively below it.
/// </summary>
/// <param name="category">Category to save.</param>
/// <param name="rootFolder">Folder in which the category's own folder is created.</param>
/// <param name="options">Format options; <c>saveDomainList</c> controls writing of the domain list file.</param>
/// <param name="logger">Optional logger, forwarded to <c>SaveSubcategories</c>.</param>
protected void SaveWebSites(WebDocumentsCategory category, folderNode rootFolder, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    folderNode folder = rootFolder.Add(category.name, category.name, category.description);

    StringBuilder domainList = new StringBuilder();
    foreach (WebSiteDocuments site in category.siteDocuments)
    {
        domainList.AppendLine(site.domain);
        SaveWebSite(site, folder);
    }

    if (options.HasFlag(WebDomainCategoryFormatOptions.saveDomainList))
    {
        File.WriteAllText(folder.pathFor(WebDomainCategory.categorySiteList, imbSCI.Data.enums.getWritableFileMode.overwrite, "Domains in category [" + category.path + "]", true), domainList.ToString());
    }

    // The logger is passed on so it stays available throughout the recursion.
    SaveSubcategories(category, folder, options, logger);
}
/// <summary>
/// Saves every direct subcategory of <paramref name="category"/> into <paramref name="rootFolder"/>.
/// </summary>
/// <param name="category">Category whose subcategories are saved.</param>
/// <param name="rootFolder">Folder that receives one sub-folder per subcategory.</param>
/// <param name="options">Format options, forwarded to <c>SaveWebSites</c>.</param>
/// <param name="logger">Optional logger, forwarded to <c>SaveWebSites</c>.</param>
protected void SaveSubcategories(WebDocumentsCategory category, folderNode rootFolder, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    foreach (WebDocumentsCategory subcat in category)
    {
        // The logger is passed on so it stays available throughout the recursion.
        SaveWebSites(subcat, rootFolder, options, logger);
    }
}
/// <summary>
/// Loads the web sites stored as page files directly inside a directory. Files are grouped by
/// the domain extracted from the URL decoded from their file name; every group becomes one
/// <see cref="WebSiteDocuments"/> added to <paramref name="category"/>.
/// </summary>
/// <param name="category">Category that receives the loaded web sites.</param>
/// <param name="di">Directory whose files are scanned (subdirectories are not visited).</param>
/// <param name="options">Format options, forwarded to <c>LoadWebSiteDocument</c>.</param>
/// <param name="logger">Optional logger; receives the number of detected sites and the page count per site.</param>
private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    FileInfo[] fileList = di.GetFiles();

    // Only an empty directory is skipped. The former "more than one file" guard also skipped a
    // directory holding a single page; files that are not pages are filtered out below anyway.
    if (fileList.Length == 0)
    {
        return;
    }

    // Group the page files by the domain found in their URL.
    var siteFilesIndex = new Dictionary<String, List<FileInfo>>();
    foreach (FileInfo fi in fileList)
    {
        String path = GetURLPathFromFilename(fi.Name);
        if (!path.StartsWith("http", StringComparison.Ordinal))
        {
            continue;
        }

        Match m = SelectDomainName.Match(path);
        if (!m.Success)
        {
            continue;
        }

        String domain = m.Groups[1].Value;
        List<FileInfo> siteFiles;
        if (!siteFilesIndex.TryGetValue(domain, out siteFiles))
        {
            siteFiles = new List<FileInfo>();
            siteFilesIndex.Add(domain, siteFiles);
        }
        siteFiles.Add(fi);
    }

    if (logger != null)
    {
        logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
    }

    foreach (KeyValuePair<String, List<FileInfo>> siteEntry in siteFilesIndex)
    {
        WebSiteDocuments webSite = new WebSiteDocuments(siteEntry.Key);

        // IDs already taken within this site; a page whose ID was seen before is dropped.
        var assignedIds = new HashSet<String>();

        foreach (FileInfo fi in siteEntry.Value)
        {
            WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);
            d.AssignedID = WebSiteDocumentsSetTools.GetPageURL(d, webSite);

            if (assignedIds.Add(d.AssignedID))
            {
                webSite.documents.Add(d);
            }
        }

        category.siteDocuments.Add(webSite);

        if (logger != null)
        {
            logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
        }
    }
}