/// <summary>
/// Runs the matching section parser for every file in <c>_htmlToParse</c>, collects any
/// location data the parsers produce, and writes that data out once all files are done.
/// </summary>
/// <remarks>
/// A file whose path contains "preservation" (any casing) marks the run as having preservation
/// data and configures its parser as a preservation parser. A file whose own name contains
/// "preservation" is not dispatched to a section parser.
/// <see cref="SectionEmptyException"/>, <see cref="MissingTestDataException"/> and
/// <see cref="NotImplementedException"/> are logged and the loop moves on to the next file;
/// location data and preservation queries are not collected for a file that raised one of them.
/// Any other exception propagates to the caller.
/// </remarks>
/// <param name="logger">Receives progress, warning and error messages.</param>
/// <param name="version">Parser version assigned to every parser created here.</param>
/// <param name="caseName">Case identifier assigned to every parser and passed to <c>WriteLocationData</c>.</param>
/// <param name="hasPreservation">Set to <c>true</c> when any file path contains "preservation"; never reset to <c>false</c> here.</param>
private void ParseHTML(CommandLineLogger logger, ParserVersionEnum version, string caseName, ref bool hasPreservation)
{
    Stopwatch stopWatch = new Stopwatch();
    logger.LogInfo("Begin parsing files.");
    stopWatch.Start();

    foreach (ExtractFileInfo htmlx in _htmlToParse)
    {
        ParserManager parser = new ParserManager(logger)
        {
            CaseNumber = caseName,
            HtmlToLoad = htmlx.File_Path,
            HtmlToRecord = htmlx.IsTemporary ? htmlx.ParentFile_Path : htmlx.File_Path,
            IsPreservation = false,
            DatabasePreservationNoPrefix = string.Empty,
            Version = version,
            DefaultDirectory = _defaultDirectory
        };

        logger.LogInfo("Processing " + htmlx.File_Path);

        try
        {
            // Ordinal, case-insensitive matching: culture-sensitive ToLower()/ToUpper() map 'I'/'i'
            // differently under some cultures (e.g. tr-TR), which would make these checks miss.
            if (htmlx.File_Path.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                hasPreservation = true;

                // Expected folder names: Preservation-1, Preservation-2, Preservation-3
                parser.IsPreservation = true;

                DirectoryInfo di = new DirectoryInfo(htmlx.File_Path);

                // Containing folder: preservation\<file>
                string p = di.Parent.Name;

                // One level further up for layouts such as preservation\folderX\index.html.
                // The grandparent is null when the file sits directly under a drive root;
                // in that case keep the containing folder's name instead of dereferencing null.
                if (p.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) < 0 && di.Parent.Parent != null)
                {
                    p = di.Parent.Parent.Name;
                }

                // Original author's note: SQLite does not accept '-' in the table names that
                // queries reference, so it is replaced with '_'.
                parser.DatabasePreservationNoPrefix = p.Replace("-", "_");
            }

            if (htmlx.File_Name.IndexOf("PRESERVATION", StringComparison.OrdinalIgnoreCase) < 0)
            {
                // Invariant upper-casing keeps the case labels matching regardless of the current culture.
                switch (htmlx.File_Name.ToUpperInvariant().Trim())
                {
                    case "ABOUT_ME.HTML":
                        parser.AboutMeParse(htmlx);
                        break;
                    case "ACCOUNT_STATUS_HISTORY.HTML":
                        parser.AccountStatusHistoryParse(htmlx);
                        break;
                    case "COMMENTS.HTML":
                        parser.CommentsParse(htmlx);
                        break;
                    case "DEVICES.HTML":
                        parser.DevicesParse(htmlx);
                        break;
                    case "DIRECT_SHARES.HTML":
                        parser.DirectSharesParse(htmlx);
                        break;
                    case "DIRECT_STORIES.HTML":
                        parser.DirectStoriesParse(htmlx);
                        break;
                    case "FOLLOWERS.HTML":
                        parser.FollowersParse(htmlx);
                        break;
                    case "FOLLOWING.HTML":
                        parser.FollowingParse(htmlx);
                        break;
                    case "GENDER.HTML":
                        parser.GenderParse(htmlx);
                        break;
                    case "INCOMING_FOLLOW_REQUESTS.HTML":
                        parser.IncomingFollowRequestsParse(htmlx);
                        break;
                    case "INDEX.HTML":
                        parser.IndexParse(htmlx);
                        break;
                    case "LIKES.HTML":
                        parser.LikesParse(htmlx);
                        break;
                    case "LINKED_ACCOUNTS.HTML":
                        parser.LinkedAccountsParse(htmlx);
                        break;
                    case "LIVE_VIDEOS.HTML":
                        parser.LiveVideosParse(htmlx);
                        break;
                    case "UNIFIED_MESSAGES.HTML":
                        parser.UnifiedMessagesParse(htmlx);
                        break;
                    case "NAME_CHANGES.HTML":
                        parser.NameChangesParse(htmlx);
                        break;
                    case "NCMEC_REPORTS.HTML":
                        parser.NcmecReportsParse(htmlx);
                        break;
                    case "PHOTOS.HTML":
                        parser.PhotosParse(htmlx);
                        break;
                    case "POPULAR_BLOCK.HTML":
                        parser.PopularBlockParse(htmlx);
                        break;
                    case "PRIVACY_SETTINGS.HTML":
                        parser.PrivacySettingsParse(htmlx);
                        break;
                    case "PROFILE_PICTURE.HTML":
                        parser.ProfilePictureParse(htmlx);
                        break;
                    case "VANITY_CHANGES.HTML":
                        parser.VanityChangesParse(htmlx);
                        break;
                    case "VIDEOS.HTML":
                        parser.VideosParse(htmlx);
                        break;
                    case "WEBSITE.HTML":
                        parser.WebsiteParse(htmlx);
                        break;
                    default:
                        logger.LogWarning("Unknown Section - \"Unknown section:" + htmlx.File_Name + "\". Please contact NDCAC with section name and test data to improve parsing functionality");
#if DEBUG
                        // Debug builds fail fast so an unhandled section is noticed during development.
                        throw new ApplicationException(htmlx.File_Name);
#endif
                        break;
                }
            }

            if (parser.LocationData != null && parser.LocationData.Any())
            {
                _locationData.AddRange(parser.LocationData);
            }

            AddSectionToPreservationList(htmlx.File_Name, parser.PreservationQueries);
        }
        catch (SectionEmptyException ex)
        {
            logger.LogWarning("Parsing " + ex.Message + " section complete - section contains no data: Excluding from database.");
        }
        catch (MissingTestDataException ex)
        {
            logger.LogWarning("Parsing " + ex.Message + " section skipped - parser not implemented: No test data available.");
        }
        catch (NotImplementedException ex)
        {
            logger.LogError("Parsing " + ex.Message + " section failed: parser not implemented.", ex);
        }
    }

    WriteLocationData(caseName);
    stopWatch.Stop();
    logger.LogInfo("Parsing files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");
}
/// <summary>
/// Processes one extract zip end to end: unzips it, parses the extracted HTML, records the
/// source files, deletes temporary files and restores any files saved with a "save-" prefix.
/// </summary>
/// <param name="extractZipFileNameAndPath">
/// Path of the zip to process. Its file name without extension is used as the case/holder name.
/// </param>
/// <returns>Always an empty string.</returns>
public string ParseInstagramHTMLExtract(string extractZipFileNameAndPath)
{
    Stopwatch overallTimer = Stopwatch.StartNew();

    bool hasPreservation = false;
    ParserVersionEnum version = ParserVersionEnum.One;
    string holderFolder = Path.GetFileNameWithoutExtension(extractZipFileNameAndPath);
    CommandLineLogger logger = new CommandLineLogger(holderFolder, _defaultDirectory);

    // The case name is handed to the zip utility. The instance is not used again in this method.
    ZipUtility zp = new ZipUtility(logger) { CaseName = holderFolder };

    // --- Step 1: extract ---
    logger.LogInfo("Begin extracting files.");
    Stopwatch stepTimer = Stopwatch.StartNew();
    UnzipFiles(logger, holderFolder, _defaultDirectory, null, extractZipFileNameAndPath, ref version);
    stepTimer.Stop();
    logger.LogInfo("Extract files complete (Time: " + stepTimer.Elapsed.GetFormattedElapsedTime() + ")... ");

    // Only used to shorten the paths shown in the "Removing file" log lines below.
    string extractPath = AppDomain.CurrentDomain.BaseDirectory + _defaultDirectory + Path.GetFileNameWithoutExtension(holderFolder);

    // --- Step 2: parse and record the source files ---
    ParseHTML(logger, version, holderFolder, ref hasPreservation);

    if (_allFiles.Count > 0)
    {
        DataAccess.AddSourceFiles(_defaultDirectory, _allFiles, holderFolder);
    }

    // TODO: Need to revisit this in the future to determine what analytics are required or wanted by the user base.
    ////compare preservation to current tables if exists
    //if (hasPreservation)
    //{
    //    _log.LogInfo("Begin comparing preservation files.");
    //    DifPreservationTables(holderFolder);
    //}

    // --- Step 3: delete temporary files, then any of their folders left without files ---
    if (_tempFilesToRemove.Any())
    {
        logger.LogInfo("Begin removing temporary files.");
        stepTimer = Stopwatch.StartNew();

        string displayPrefix = extractPath + "\\";
        List<string> touchedFolders = new List<string>();

        foreach (ExtractFileInfo tempFile in _tempFilesToRemove)
        {
            logger.LogInfo("Removing file: " + tempFile.File_Path.Replace(displayPrefix, ""));
            File.Delete(tempFile.File_Path);

            string folder = Path.GetDirectoryName(tempFile.File_Path);
            if (!touchedFolders.Contains(folder))
            {
                touchedFolders.Add(folder);
            }
        }

        foreach (string folder in touchedFolders)
        {
            if (Directory.GetFiles(folder).Length != 0)
            {
                continue;
            }

            Directory.Delete(folder);
        }

        stepTimer.Stop();
        logger.LogInfo("Removing temporary files complete (Time: " + stepTimer.Elapsed.GetFormattedElapsedTime() + ")... ");
    }

    // --- Step 4: restore files that were set aside as "save-<name>" next to the original ---
    if (_filesToPreserve.Any())
    {
        foreach (ExtractFileInfo file in _filesToPreserve)
        {
            string savedCopy = Path.GetDirectoryName(file.File_Path) + "\\save-" + file.File_Name;
            if (!File.Exists(savedCopy))
            {
                continue;
            }

            // Put the saved copy back only when the original is gone; the copy is removed either way.
            if (!File.Exists(file.File_Path))
            {
                File.Copy(savedCopy, file.File_Path);
            }

            File.Delete(savedCopy);
        }
    }

    overallTimer.Stop();
    logger.LogInfoAlert("Processing Complete. Total Processing time: " + overallTimer.Elapsed.GetFormattedElapsedTime());
    return string.Empty;
}
/// <summary>
/// Extracts entries of a zip archive into a per-archive folder under the application's base
/// directory and returns a description of every file that was written.
/// </summary>
/// <remarks>
/// Leading container folders are stripped from entry paths: the number of levels is the depth
/// of the shallowest "index.html" entry and is cached in <c>_containerFolderLevels</c> once found.
/// A nested file whose name contains "index" is written directly under the extraction folder,
/// with "index" in its name replaced by the name of its parent folder.
/// Files that already exist are neither overwritten nor reported. Failures are logged and never
/// thrown for archive or entry errors; the files extracted up to that point are still returned.
/// </remarks>
/// <param name="defaultDirectory">Appended to the application base directory to form the root folder; "Holder" is used when null or empty.</param>
/// <param name="zipFilePath">Path of the archive to read.</param>
/// <param name="fileType">When null or empty every entry is considered; otherwise only entries whose full name ends with this value (case-insensitive).</param>
/// <param name="zipProgress">Optional sink that receives a percentage after each archive entry.</param>
/// <returns>One <c>ExtractFileInfo</c> per file newly written to disk.</returns>
public IEnumerable<ExtractFileInfo> ExtractZip(string defaultDirectory, string zipFilePath, string fileType, IProgress<ProgressValue> zipProgress = null)
{
    // NOTE(review): the "Holder" fallback has no trailing separator, so the archive name is
    // appended straight onto it ("...HolderName") - confirm whether that is intended.
    string rootDataHolderPath = !string.IsNullOrEmpty(defaultDirectory)
        ? AppDomain.CurrentDomain.BaseDirectory + defaultDirectory
        : AppDomain.CurrentDomain.BaseDirectory + "Holder";

    List<ExtractFileInfo> files = new List<ExtractFileInfo>();

    // A nested archive is extracted into a sub-folder of its parent archive's folder.
    string extractTo = string.IsNullOrEmpty(ParentZipFilePath)
        ? rootDataHolderPath + Path.GetFileNameWithoutExtension(zipFilePath)
        : rootDataHolderPath + Path.GetFileNameWithoutExtension(ParentZipFilePath) + "\\" + Path.GetFileNameWithoutExtension(zipFilePath);

    try
    {
        using (ZipArchive archive = ZipFile.OpenRead(zipFilePath))
        {
            if (!Directory.Exists(extractTo))
            {
                Directory.CreateDirectory(extractTo);
                _log.LogInfo("Created Directory: " + extractTo);
            }

            if (!_containerFolderLevels.HasValue)
            {
                _containerFolderLevels = FindContainerFolderLevels(archive);
            }

            int foldersToSkip = _containerFolderLevels.HasValue ? _containerFolderLevels.Value : 0;

            // Normalised root with a trailing separator, used to reject entries that would
            // resolve to a location outside the extraction folder.
            string extractRoot = Path.GetFullPath(extractTo);
            string separator = Path.DirectorySeparatorChar.ToString();
            if (!extractRoot.EndsWith(separator, StringComparison.Ordinal))
            {
                extractRoot += separator;
            }

            int entryCount = 0;
            foreach (ZipArchiveEntry entry in archive.Entries)
            {
                bool wanted = string.IsNullOrEmpty(fileType)
                    || entry.FullName.EndsWith(fileType, StringComparison.OrdinalIgnoreCase);
                if (wanted)
                {
                    ExtractEntryInto(entry, extractTo, extractRoot, foldersToSkip, files);
                }

                entryCount++;
                if (zipProgress != null)
                {
                    int percent = (int)Math.Round(((double)entryCount / (double)archive.Entries.Count) * 100, 0);
                    zipProgress.Report(new ProgressValue { Message = "Extracting " + Path.GetFileName(zipFilePath) + "... ", PercentComplete = percent });
                }
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: callers still receive whatever was extracted, but the failure is no longer silent.
        // NOTE(review): LogInfo is the only _log member this method is known to have; switch to a
        // warning/error level if the logger exposes one.
        _log.LogInfo("Extracting " + zipFilePath + " failed: " + e.Message);
    }

    return files;
}

/// <summary>
/// Returns how many leading folders wrap the archive content, based on the shallowest
/// "index.html" entry (0 when one sits at the archive root), or null when there is none.
/// </summary>
private static int? FindContainerFolderLevels(ZipArchive archive)
{
    // Ordinal comparison: culture-sensitive ToUpper() can fail to match "index" under some cultures.
    if (archive.Entries.Any(x => string.Equals(x.FullName, "INDEX.HTML", StringComparison.OrdinalIgnoreCase)))
    {
        return 0;
    }

    List<int> depths = archive.Entries
        .Where(x => string.Equals(x.Name, "INDEX.HTML", StringComparison.OrdinalIgnoreCase))
        .Select(x => x.FullName.Split('/').Length)
        .ToList();

    return depths.Any() ? depths.Min() - 1 : (int?)null;
}

/// <summary>
/// Writes one archive entry beneath <paramref name="extractTo"/> and records it in
/// <paramref name="files"/>; directory entries, "._" files, existing files and entries that
/// would land outside the extraction folder are skipped.
/// </summary>
private void ExtractEntryInto(ZipArchiveEntry entry, string extractTo, string extractRoot, int foldersToSkip, List<ExtractFileInfo> files)
{
    string[] segments = entry.FullName.Split('/').Skip(foldersToSkip).ToArray();
    string originalFile = string.Join("\\", segments);
    string file = originalFile;

    // NOTE(review): the "index" check ignores case but Replace is case-sensitive, so a name like
    // "Index.html" keeps its name and is flattened to the extraction root - confirm intent.
    if (segments.Length > 1 && segments[segments.Length - 1].IndexOf("INDEX", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        file = segments[segments.Length - 1].Replace("index", segments[segments.Length - 2]);
    }

    // Nothing left after stripping container folders, a "._" file, or a directory entry.
    if (file.Length == 0 || file.StartsWith("._", StringComparison.Ordinal) || file.EndsWith("\\", StringComparison.Ordinal))
    {
        return;
    }

    try
    {
        string destination = Path.Combine(extractTo, file);

        // Entry names come from the archive and may contain ".." or be rooted; never write
        // outside the extraction folder.
        if (!Path.GetFullPath(destination).StartsWith(extractRoot, StringComparison.OrdinalIgnoreCase))
        {
            _log.LogInfo("Skipped entry that resolves outside the extraction directory: " + entry.FullName);
            return;
        }

        string destinationDirectory = Path.GetDirectoryName(destination);
        if (!Directory.Exists(destinationDirectory))
        {
            Directory.CreateDirectory(destinationDirectory);
            _log.LogInfo("Created Directory: " + destinationDirectory);
        }

        if (!File.Exists(destination))
        {
            entry.ExtractToFile(destination);
            _log.LogInfo("Created:" + destination);
            files.Add(new ExtractFileInfo()
            {
                File_Path = destination,
                OriginalFile_Path = Path.Combine(extractTo, originalFile),
                Root_Path = extractTo
            });
        }
    }
    catch (Exception ex)
    {
        // One bad entry must not stop the remaining entries, but the reason is recorded.
        _log.LogInfo("Failed to extract " + entry.FullName + ": " + ex.Message);
    }
}