/// <summary>
/// Saves every direct subcategory of <paramref name="category"/> by calling SaveWebSites on each of them.
/// </summary>
/// <param name="category">The category whose child categories are enumerated and saved.</param>
/// <param name="rootFolder">The folder handed to SaveWebSites as the parent folder for each subcategory.</param>
/// <param name="options">Format options, passed through unchanged.</param>
/// <param name="logger">Optional logger, forwarded to SaveWebSites; may be null.</param>
protected void SaveSubcategories(WebDocumentsCategory category, folderNode rootFolder, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    foreach (WebDocumentsCategory subcat in category)
    {
        // Forward the caller's logger: it was previously dropped here, so nothing
        // below this point in the save recursion could ever receive it.
        SaveWebSites(subcat, rootFolder, options, logger);
    }
}
/// <summary>
/// Builds a category tree from the directory found at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the dataset directory.</param>
/// <param name="options">Format options forwarded to the directory loader.</param>
/// <param name="logger">Optional logger; receives a message when the directory does not exist.</param>
/// <returns>
/// The root category, named after the directory. When the directory does not exist,
/// the root is returned without any content loaded into it.
/// </returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="path"/> is null or empty.</exception>
public WebDocumentsCategory LoadDataset(String path, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    WebDocumentsCategory root = new WebDocumentsCategory();

    if (path.isNullOrEmpty())
    {
        throw new ArgumentException("Path is empty or null", nameof(path));
    }

    DirectoryInfo datasetDir = new DirectoryInfo(path);
    root.name = datasetDir.Name;

    if (datasetDir.Exists)
    {
        LoadDirectory(root, datasetDir, options, logger);
    }
    else if (logger != null)
    {
        // A missing directory is reported, not thrown: the caller still gets the named, empty root.
        logger.log("Directory " + path + " not found!");
    }

    return root;
}
/// <summary>
/// Creates a new <c>WebDocumentsCategory</c> with the given name and adds it to this node.
/// </summary>
/// <param name="nameForChild">Name assigned to the new child category.</param>
/// <returns>The newly created child category.</returns>
public override graphNodeCustom CreateChildItem(string nameForChild)
{
    var child = new WebDocumentsCategory { name = nameForChild };
    Add(child);
    return child;
}
/// <summary>
/// Resolves <paramref name="__path"/> to a category by delegating to
/// <c>graphTools.ConvertPathToGraph</c>, starting from this node and using this
/// node's <c>pathSeparator</c>.
/// </summary>
/// <param name="__path">Path of the category to resolve.</param>
/// <param name="isAbsolute">Passed through to <c>ConvertPathToGraph</c> unchanged.</param>
/// <returns>The category returned by <c>ConvertPathToGraph</c>.</returns>
/// <remarks>
/// NOTE(review): the method name suggests that missing nodes are created, presumably
/// enabled by the final <c>true</c> argument - confirm against <c>graphTools</c>.
/// </remarks>
public WebDocumentsCategory GetOrAdd(String __path, Boolean isAbsolute)
{
    // The former empty "if (cat == this) { }" branch did nothing and has been removed.
    return graphTools.ConvertPathToGraph<WebDocumentsCategory>(this, __path, isAbsolute, pathSeparator, true);
}
/// <summary>
/// Recursively mirrors the subdirectories of <paramref name="di"/> as child categories of
/// <paramref name="category"/>, loading the web sites found in each subdirectory.
/// </summary>
/// <remarks>
/// Only subdirectories are processed here: files located directly in <paramref name="di"/>
/// are not loaded by this call.
/// </remarks>
/// <param name="category">The category that receives one child per subdirectory.</param>
/// <param name="di">The directory whose subdirectories are traversed.</param>
/// <param name="options">Format options forwarded to the web site loader.</param>
/// <param name="logger">Optional logger forwarded to the web site loader; may be null.</param>
private void LoadDirectory(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    foreach (DirectoryInfo dir in di.GetDirectories())
    {
        // Direct cast instead of "as": if CreateChildItem ever yields a different node type,
        // fail here with InvalidCastException rather than later with a NullReferenceException.
        WebDocumentsCategory child = (WebDocumentsCategory)category.CreateChildItem(dir.Name);

        LoadWebSites(child, dir, options, logger);
        LoadDirectory(child, dir, options, logger);
    }
}
/// <summary>
/// Copies the sites of each supplied set into the category resolved from the set's name.
/// </summary>
/// <remarks>
/// For every incoming site, the first already-stored site with the same domain (if any)
/// is removed before the incoming site is added.
/// </remarks>
/// <param name="categorySet">The sets to merge in; each set's name is resolved through GetOrAdd as a non-absolute path.</param>
public void SetCategoryByDataset(IEnumerable<WebSiteDocumentsSet> categorySet)
{
    foreach (WebSiteDocumentsSet sourceSet in categorySet)
    {
        WebDocumentsCategory target = GetOrAdd(sourceSet.name, false);
        var storedSites = target.siteDocuments;

        foreach (WebSiteDocuments incoming in sourceSet)
        {
            var previous = storedSites.FirstOrDefault(stored => stored.domain == incoming.domain);

            if (previous != null)
            {
                storedSites.Remove(previous);
            }

            storedSites.Add(incoming);
        }
    }
}
/// <summary>
/// Saves <paramref name="dataset"/> into the directory at <paramref name="path"/>: the web sites
/// of the whole category tree and, depending on <paramref name="options"/>, graph files and readme files.
/// </summary>
/// <param name="dataset">The root category to save.</param>
/// <param name="path">Path of the target directory.</param>
/// <param name="options">
/// Flags selecting optional outputs: <c>saveGraphAtRoot</c> writes the category graph files,
/// <c>saveReadmeFile</c> generates readme files for the folder.
/// </param>
/// <param name="logger">Optional logger, forwarded to SaveWebSites; may be null.</param>
public void SaveDataset(WebDocumentsCategory dataset, String path, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    folderNode folder = new DirectoryInfo(path);

    // Folder description = dataset description combined with this instance's description.
    folder.description = dataset.description.add(description, Environment.NewLine);

    // Forward the caller's logger: it was previously dropped at this call.
    SaveWebSites(dataset, folder, options, logger);

    if (options.HasFlag(WebDomainCategoryFormatOptions.saveGraphAtRoot))
    {
        // NOTE(review): the meaning of the constant 300 is not visible here - confirm against the converter.
        var dmgl = GraphConverters.documentsConverter.Convert(dataset, 300);
        dmgl.Save(folder.pathFor("dataset", imbSCI.Data.enums.getWritableFileMode.overwrite, "Directed graph of categories in the dataset", true));

        var dot = imbSCI.Graph.Converters.GraphConversionTools.ConvertToDOT(dmgl);

        // NOTE(review): this file uses getWritableFileMode.existing while the one above uses overwrite - confirm this is intended.
        dot.Save(folder.pathFor("dataset_dot", imbSCI.Data.enums.getWritableFileMode.existing, "DOT graph of categories in the dataset", true));
    }

    if (options.HasFlag(WebDomainCategoryFormatOptions.saveReadmeFile))
    {
        folder.generateReadmeFiles(imbACE.Core.appManager.AppInfo);
    }
}
/// <summary>
/// Saves the web sites of <paramref name="category"/> into a folder created for it under
/// <paramref name="rootFolder"/>, then saves its subcategories into that same folder.
/// </summary>
/// <param name="category">The category to save; its name and description are used for the new folder.</param>
/// <param name="rootFolder">The parent folder under which the category folder is added.</param>
/// <param name="options">
/// Format options; when <c>saveDomainList</c> is set, a file listing the domains of this
/// category (one per line) is written into the category folder.
/// </param>
/// <param name="logger">Optional logger, forwarded to SaveSubcategories; may be null.</param>
protected void SaveWebSites(WebDocumentsCategory category, folderNode rootFolder, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    folderNode folder = rootFolder.Add(category.name, category.name, category.description);

    StringBuilder domainList = new StringBuilder();

    foreach (WebSiteDocuments site in category.siteDocuments)
    {
        domainList.AppendLine(site.domain);
        SaveWebSite(site, folder);
    }

    if (options.HasFlag(WebDomainCategoryFormatOptions.saveDomainList))
    {
        File.WriteAllText(folder.pathFor(WebDomainCategory.categorySiteList, imbSCI.Data.enums.getWritableFileMode.overwrite, "Domains in category [" + category.path + "]", true), domainList.ToString());
    }

    // Forward the caller's logger: it was previously dropped at this call.
    SaveSubcategories(category, folder, options, logger);
}
/// <summary>
/// Loads the files of <paramref name="di"/> as web site documents, grouped by domain, and adds
/// one <c>WebSiteDocuments</c> entry per detected domain to <paramref name="category"/>.
/// </summary>
/// <remarks>
/// A file is considered only when the URL path derived from its name starts with "http" and
/// matches <c>SelectDomainName</c>; the first capture group is used as the domain. Within one
/// site, a document whose assigned ID was already seen is skipped.
/// </remarks>
/// <param name="category">The category that receives the loaded web sites.</param>
/// <param name="di">The directory whose files are scanned (subdirectories are not scanned here).</param>
/// <param name="options">Format options forwarded to the document loader.</param>
/// <param name="logger">Optional logger for the detected-site count and per-site page counts; may be null.</param>
private void LoadWebSites(WebDocumentsCategory category, DirectoryInfo di, WebDomainCategoryFormatOptions options, ILogBuilder logger = null)
{
    FileInfo[] fileList = di.GetFiles();

    // NOTE(review): directories holding exactly one file are skipped by this check
    // (original condition: Length > 1). Confirm whether a single-page directory should be loaded.
    if (fileList.Length <= 1)
    {
        return;
    }

    // Pass 1: group the files by the domain encoded in their names.
    Dictionary<String, List<FileInfo>> siteFilesIndex = new Dictionary<String, List<FileInfo>>();

    foreach (FileInfo fi in fileList)
    {
        String path = GetURLPathFromFilename(fi.Name);

        // Ordinal comparison: this is a protocol prefix, not linguistic text.
        if (!path.StartsWith("http", StringComparison.Ordinal))
        {
            continue;
        }

        Match m = SelectDomainName.Match(path);
        if (!m.Success)
        {
            continue;
        }

        String domain = m.Groups[1].Value;

        List<FileInfo> siteFiles;
        if (!siteFilesIndex.TryGetValue(domain, out siteFiles))
        {
            siteFiles = new List<FileInfo>();
            siteFilesIndex.Add(domain, siteFiles);
        }

        siteFiles.Add(fi);
    }

    if (logger != null)
    {
        logger.log("Web sites detected: [" + siteFilesIndex.Count + "]");
    }

    // Pass 2: load the documents of each site, keeping the first document per assigned ID.
    foreach (KeyValuePair<String, List<FileInfo>> siteEntry in siteFilesIndex)
    {
        WebSiteDocuments webSite = new WebSiteDocuments(siteEntry.Key);
        HashSet<String> seenIds = new HashSet<String>();

        foreach (FileInfo fi in siteEntry.Value)
        {
            WebSiteDocument d = LoadWebSiteDocument(fi, webSite, options);
            d.AssignedID = WebSiteDocumentsSetTools.GetPageURL(d, webSite);

            // HashSet.Add returns false for an ID that is already present, so duplicates are dropped.
            if (seenIds.Add(d.AssignedID))
            {
                webSite.documents.Add(d);
            }
        }

        category.siteDocuments.Add(webSite);

        if (logger != null)
        {
            logger.log(category.path + " -> [" + webSite.domain + "] -> pages [" + webSite.documents.Count + "]");
        }
    }
}