コード例 #1
0
        /// <summary>
        /// Parses every file queued in <c>_htmlToParse</c>, dispatching each one to the
        /// <c>ParserManager</c> routine that matches its file name, then collects location
        /// data and preservation queries produced by the parser.
        /// </summary>
        /// <param name="logger">Logger that receives progress, warning and error messages.</param>
        /// <param name="version">Parser version assigned to each <c>ParserManager</c>.</param>
        /// <param name="caseName">Case name assigned to each parser and passed to <c>WriteLocationData</c>.</param>
        /// <param name="hasPreservation">
        /// Set to <c>true</c> when at least one file path contains "preservation" (any casing).
        /// It is never reset to <c>false</c> by this method.
        /// </param>
        /// <exception cref="ApplicationException">
        /// DEBUG builds only: thrown when a file name matches none of the known sections.
        /// </exception>
        private void ParseHTML(CommandLineLogger logger, ParserVersionEnum version, string caseName, ref bool hasPreservation)
        {
            Stopwatch stopWatch = new Stopwatch();

            logger.LogInfo("Begin parsing files.");
            stopWatch.Start();
            foreach (ExtractFileInfo htmlx in _htmlToParse)
            {
                ParserManager parser = new ParserManager(logger)
                {
                    CaseNumber     = caseName,
                    HtmlToLoad     = htmlx.File_Path,
                    HtmlToRecord   = htmlx.IsTemporary ? htmlx.ParentFile_Path : htmlx.File_Path,
                    IsPreservation = false,
                    DatabasePreservationNoPrefix = string.Empty,
                    Version          = version,
                    DefaultDirectory = _defaultDirectory
                };

                logger.LogInfo("Processing " + htmlx.File_Path);
                try
                {
                    // Ordinal, case-insensitive matching: culture-sensitive ToLower()/ToUpper() can
                    // change the letters being compared (e.g. 'i' under a Turkish culture).
                    if (htmlx.File_Path.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        hasPreservation = true;
                        /*Preservation-1, Preservation-2, Preservation-3  */
                        parser.IsPreservation = true;

                        // Start at the containing folder (preservation\index.html). When that folder
                        // is not the preservation folder, go up one more level
                        // (preservation\folderX\index.html).
                        DirectoryInfo folder = new DirectoryInfo(htmlx.File_Path).Parent;
                        if (folder != null &&
                            folder.Name.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) < 0 &&
                            folder.Parent != null)
                        {
                            folder = folder.Parent;
                        }

                        // sqlite doesn't like sql queries referencing tables with a '-', change to '_'.
                        // When no parent folder exists the prefix stays empty instead of throwing.
                        parser.DatabasePreservationNoPrefix = folder != null ? folder.Name.Replace("-", "_") : string.Empty;
                    }

                    if (htmlx.File_Name.IndexOf("PRESERVATION", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        // Invariant upper-casing keeps the case labels below matching under every culture.
                        string sectionKey = htmlx.File_Name.ToUpperInvariant().Trim();
                        switch (sectionKey)
                        {
                        case "ABOUT_ME.HTML":
                            parser.AboutMeParse(htmlx);
                            break;

                        case "ACCOUNT_STATUS_HISTORY.HTML":
                            parser.AccountStatusHistoryParse(htmlx);
                            break;

                        case "COMMENTS.HTML":
                            parser.CommentsParse(htmlx);
                            break;

                        case "DEVICES.HTML":
                            parser.DevicesParse(htmlx);
                            break;

                        case "DIRECT_SHARES.HTML":
                            parser.DirectSharesParse(htmlx);
                            break;

                        case "DIRECT_STORIES.HTML":
                            parser.DirectStoriesParse(htmlx);
                            break;

                        case "FOLLOWERS.HTML":
                            parser.FollowersParse(htmlx);
                            break;

                        case "FOLLOWING.HTML":
                            parser.FollowingParse(htmlx);
                            break;

                        case "GENDER.HTML":
                            parser.GenderParse(htmlx);
                            break;

                        case "INCOMING_FOLLOW_REQUESTS.HTML":
                            parser.IncomingFollowRequestsParse(htmlx);
                            break;

                        case "INDEX.HTML":
                            parser.IndexParse(htmlx);
                            break;

                        case "LIKES.HTML":
                            parser.LikesParse(htmlx);
                            break;

                        case "LINKED_ACCOUNTS.HTML":
                            parser.LinkedAccountsParse(htmlx);
                            break;

                        case "LIVE_VIDEOS.HTML":
                            parser.LiveVideosParse(htmlx);
                            break;

                        case "UNIFIED_MESSAGES.HTML":
                            parser.UnifiedMessagesParse(htmlx);
                            break;

                        case "NAME_CHANGES.HTML":
                            parser.NameChangesParse(htmlx);
                            break;

                        case "NCMEC_REPORTS.HTML":
                            parser.NcmecReportsParse(htmlx);
                            break;

                        case "PHOTOS.HTML":
                            parser.PhotosParse(htmlx);
                            break;

                        case "POPULAR_BLOCK.HTML":
                            parser.PopularBlockParse(htmlx);
                            break;

                        case "PRIVACY_SETTINGS.HTML":
                            parser.PrivacySettingsParse(htmlx);
                            break;

                        case "PROFILE_PICTURE.HTML":
                            parser.ProfilePictureParse(htmlx);
                            break;

                        case "VANITY_CHANGES.HTML":
                            parser.VanityChangesParse(htmlx);
                            break;

                        case "VIDEOS.HTML":
                            parser.VideosParse(htmlx);
                            break;

                        case "WEBSITE.HTML":
                            parser.WebsiteParse(htmlx);
                            break;

                        default:
                            logger.LogWarning("Unknown Section - \"Unknown section:" + htmlx.File_Name + "\".  Please contact NDCAC with section name and test data to improve parsing functionality");
                            // DEBUG builds fail fast on an unknown section; release builds only warn.
#if DEBUG
                            throw new ApplicationException(htmlx.File_Name);
#else
                            break;
#endif
                        }
                    }
                    if (parser.LocationData != null && parser.LocationData.Any())
                    {
                        _locationData.AddRange(parser.LocationData);
                    }
                    AddSectionToPreservationList(htmlx.File_Name, parser.PreservationQueries);
                }
                catch (SectionEmptyException ex)
                {
                    logger.LogWarning("Parsing " + ex.Message + " section complete - section contains no data: Excluding from database.");
                }
                catch (MissingTestDataException ex)
                {
                    logger.LogWarning("Parsing " + ex.Message + " section skipped - parser not implemented: No test data available.");
                }
                catch (NotImplementedException ex)
                {
                    logger.LogError("Parsing " + ex.Message + " section failed: parser not implemented.", ex);
                }
            }
            // Location data is written before the stopwatch stops, so its cost is part of the reported time.
            WriteLocationData(caseName);
            stopWatch.Stop();
            logger.LogInfo("Parsing files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");
        }
コード例 #2
0
        /// <summary>
        /// Runs the whole processing pipeline for one extract zip: unzips it, parses the queued
        /// HTML files, records the source files, removes temporary files and restores any
        /// preserved ("save-" prefixed) files.
        /// </summary>
        /// <param name="extractZipFileNameAndPath">
        /// Path of the zip file to process. Its file name without extension is used as the
        /// case/holder folder name.
        /// </param>
        /// <returns>Always an empty string.</returns>
        public string ParseInstagramHTMLExtract(string extractZipFileNameAndPath)
        {
            Stopwatch totalProcessingTime = Stopwatch.StartNew();

            bool              hasPreservation = false;
            ParserVersionEnum version         = ParserVersionEnum.One;
            string            holderFolder    = Path.GetFileNameWithoutExtension(extractZipFileNameAndPath);
            CommandLineLogger logger          = new CommandLineLogger(holderFolder, _defaultDirectory);

            //pass case name to extract zip
            // NOTE(review): zp is never used afterwards; it is kept because the constructor's
            // side effects are not visible from this method - confirm before removing.
            ZipUtility zp = new ZipUtility(logger)
            {
                CaseName = holderFolder
            };

            logger.LogInfo("Begin extracting files.");
            Stopwatch stopWatch = Stopwatch.StartNew();
            UnzipFiles(logger, holderFolder, _defaultDirectory, null, extractZipFileNameAndPath, ref version);
            stopWatch.Stop();
            logger.LogInfo("Extract files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");

            // extractPath is only used to shorten the paths written to the log below.
            // NOTE(review): holderFolder already has its extension removed; stripping again drops
            // anything after the last '.' in the case name - confirm this is intended.
            string holderPath  = AppDomain.CurrentDomain.BaseDirectory + _defaultDirectory;
            string extractPath = holderPath + Path.GetFileNameWithoutExtension(holderFolder);

            ParseHTML(logger, version, holderFolder, ref hasPreservation);


            if (_allFiles.Count > 0)
            {
                DataAccess.AddSourceFiles(_defaultDirectory, _allFiles, holderFolder);
            }

            // TODO: Need to revisit this in the future to determine what analytics are required or wanted by the user base.
            ////compare preservation to current tables if exists
            //if (hasPreservation)
            //{
            //    _log.LogInfo("Begin comparing preservation files.");
            //    DifPreservationTables(holderFolder);
            //}

            if (_tempFilesToRemove.Any())
            {
                logger.LogInfo("Begin removing temporary files.");
                stopWatch = Stopwatch.StartNew();

                // Distinct directories of the removed files, in first-seen order.
                List <string> tempPathsToRemove = new List <string>();
                foreach (ExtractFileInfo tempFile in _tempFilesToRemove)
                {
                    logger.LogInfo("Removing file: " + tempFile.File_Path.Replace(extractPath + "\\", ""));
                    File.Delete(tempFile.File_Path);

                    string tempDirectory = Path.GetDirectoryName(tempFile.File_Path);
                    if (!tempPathsToRemove.Contains(tempDirectory))
                    {
                        tempPathsToRemove.Add(tempDirectory);
                    }
                }

                foreach (string pathToRemove in tempPathsToRemove)
                {
                    // Non-recursive Directory.Delete throws when the directory still holds files OR
                    // sub-directories, so both are checked. Directory.Exists also returns false for
                    // a null path or a directory that has already been removed.
                    if (Directory.Exists(pathToRemove) && !Directory.EnumerateFileSystemEntries(pathToRemove).Any())
                    {
                        Directory.Delete(pathToRemove);
                    }
                }
                stopWatch.Stop();
                logger.LogInfo("Removing temporary files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");
            }

            foreach (ExtractFileInfo file in _filesToPreserve)
            {
                string preservedFileName = Path.GetDirectoryName(file.File_Path) + "\\save-" + file.File_Name;
                if (File.Exists(preservedFileName))
                {
                    // Restore the saved copy only when the original is gone, then drop the copy.
                    if (!File.Exists(file.File_Path))
                    {
                        File.Copy(preservedFileName, file.File_Path);
                    }
                    File.Delete(preservedFileName);
                }
            }

            totalProcessingTime.Stop();
            logger.LogInfoAlert("Processing Complete.  Total Processing time: " + totalProcessingTime.Elapsed.GetFormattedElapsedTime());
            return("");
        }
コード例 #3
0
        /// <summary>
        /// Extracts the entries of a zip archive into a folder named after the archive, located
        /// under the application's base directory plus <paramref name="defaultDirectory"/>
        /// (or "Holder" when none is given). When <c>ParentZipFilePath</c> is set, the folder is
        /// nested inside a folder named after the parent archive.
        /// </summary>
        /// <param name="defaultDirectory">Directory name appended to the application base directory; "Holder" is used when null or empty.</param>
        /// <param name="zipFilePath">Path of the zip archive to read.</param>
        /// <param name="fileType">
        /// When not null or empty, only entries whose full name ends with this text (case-insensitive)
        /// are extracted; otherwise every entry is considered.
        /// </param>
        /// <param name="zipProgress">Optional sink that receives a percentage after each archive entry.</param>
        /// <returns>
        /// One <c>ExtractFileInfo</c> per file newly written to disk. Files that already exist,
        /// entries that are skipped and entries that fail to extract are not included.
        /// </returns>
        /// <remarks>
        /// Failures are logged and never thrown: a failure opening the archive yields the files
        /// extracted so far (possibly none), a failure on one entry skips only that entry.
        /// </remarks>
        public IEnumerable <ExtractFileInfo> ExtractZip(string defaultDirectory, string zipFilePath, string fileType, IProgress <ProgressValue> zipProgress = null)
        {
            string rootDataHolderPath = !string.IsNullOrEmpty(defaultDirectory) ? AppDomain.CurrentDomain.BaseDirectory + defaultDirectory :
                                        AppDomain.CurrentDomain.BaseDirectory + "Holder";

            List <ExtractFileInfo> files = new List <ExtractFileInfo>();
            string archiveFolder         = Path.GetFileNameWithoutExtension(zipFilePath);

            string extractTo = string.IsNullOrEmpty(ParentZipFilePath)
                               ? rootDataHolderPath + archiveFolder
                               : rootDataHolderPath + Path.GetFileNameWithoutExtension(ParentZipFilePath) + "\\" + archiveFolder;

            try
            {
                using (ZipArchive archive = ZipFile.OpenRead(zipFilePath))
                {
                    if (!Directory.Exists(extractTo))
                    {
                        Directory.CreateDirectory(extractTo);
                        _log.LogInfo("Created Directory: " + extractTo);
                    }

                    if (!_containerFolderLevels.HasValue)
                    {
                        // Work out how many leading container folders wrap the data: zero when an
                        // index.html sits at the archive root, otherwise the depth of the
                        // shallowest index.html. Left unset when the archive has no index.html.
                        if (archive.Entries.Any(x => string.Equals(x.FullName, "INDEX.HTML", StringComparison.OrdinalIgnoreCase)))
                        {
                            _containerFolderLevels = 0;
                        }
                        else
                        {
                            List <int> indexDepths = archive.Entries
                                                     .Where(x => string.Equals(x.Name, "INDEX.HTML", StringComparison.OrdinalIgnoreCase))
                                                     .Select(x => x.FullName.Split('/').Length)
                                                     .ToList();
                            _containerFolderLevels = indexDepths.Any() ? indexDepths.Min() - 1 : (int?)null;
                        }
                    }

                    int skipLevels = _containerFolderLevels ?? 0;

                    // Normalised root with a trailing separator, used to reject entries that would
                    // resolve outside the extraction folder.
                    string extractRoot = Path.GetFullPath(extractTo);
                    if (!extractRoot.EndsWith(Path.DirectorySeparatorChar.ToString(), StringComparison.Ordinal))
                    {
                        extractRoot += Path.DirectorySeparatorChar;
                    }

                    int entryCount = 0;
                    foreach (ZipArchiveEntry entry in archive.Entries)
                    {
                        bool wanted = string.IsNullOrEmpty(fileType) ||
                                      entry.FullName.EndsWith(fileType, StringComparison.OrdinalIgnoreCase);
                        if (wanted)
                        {
                            ExtractFileInfo extracted = TryExtractArchiveEntry(entry, extractTo, extractRoot, skipLevels);
                            if (extracted != null)
                            {
                                files.Add(extracted);
                            }
                        }

                        // Progress counts every entry, including filtered and skipped ones.
                        entryCount++;
                        if (zipProgress != null)
                        {
                            int percent = (int)Math.Round(((double)entryCount / (double)archive.Entries.Count) * 100, 0);
                            zipProgress.Report(new ProgressValue {
                                Message = "Extracting " + Path.GetFileName(zipFilePath) + "... ", PercentComplete = percent
                            });
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: report the failure and return whatever was extracted so far.
                _log.LogInfo("Failed to extract " + zipFilePath + ": " + e.Message);
            }

            return(files);
        }

        /// <summary>
        /// Extracts a single archive entry beneath <paramref name="extractTo"/>, creating the
        /// destination directory when needed.
        /// </summary>
        /// <param name="entry">Archive entry to extract.</param>
        /// <param name="extractTo">Folder the archive is being extracted into.</param>
        /// <param name="extractRoot">Full path of <paramref name="extractTo"/> ending with a directory separator.</param>
        /// <param name="skipLevels">Number of leading path segments of the entry name to drop.</param>
        /// <returns>
        /// Information about the written file, or <c>null</c> when the entry was skipped, already
        /// existed on disk, or could not be extracted.
        /// </returns>
        private ExtractFileInfo TryExtractArchiveEntry(ZipArchiveEntry entry, string extractTo, string extractRoot, int skipLevels)
        {
            string[] segments     = entry.FullName.Split('/').Skip(skipLevels).ToArray();
            string   originalFile = string.Join("\\", segments);
            string   file         = originalFile;

            // A nested "index" file is flattened to the extraction root and renamed after its
            // containing folder (lower-case "index" is the text that gets replaced).
            if (segments.Length > 1 &&
                segments[segments.Length - 1].IndexOf("INDEX", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                file = segments[segments.Length - 1].Replace("index", segments[segments.Length - 2]);
            }

            // Skip entries with no remaining name, names starting with "._" and directory entries.
            if (file.Length == 0 ||
                file.StartsWith("._", StringComparison.Ordinal) ||
                file.EndsWith("\\", StringComparison.Ordinal))
            {
                return(null);
            }

            try
            {
                string destination = Path.Combine(extractTo, file);

                // Entry names come from the archive and may contain ".." or rooted paths; never
                // write outside the extraction folder.
                if (!Path.GetFullPath(destination).StartsWith(extractRoot, StringComparison.OrdinalIgnoreCase))
                {
                    _log.LogInfo("Skipped entry outside extraction directory: " + entry.FullName);
                    return(null);
                }

                string destinationDirectory = Path.GetDirectoryName(destination);
                if (!Directory.Exists(destinationDirectory))
                {
                    Directory.CreateDirectory(destinationDirectory);
                    _log.LogInfo("Created Directory: " + destinationDirectory);
                }

                // Existing files are left untouched and not reported.
                if (File.Exists(destination))
                {
                    return(null);
                }

                entry.ExtractToFile(destination);
                _log.LogInfo("Created:" + destination);
                return(new ExtractFileInfo()
                {
                    File_Path = destination, OriginalFile_Path = Path.Combine(extractTo, originalFile), Root_Path = extractTo
                });
            }
            catch (Exception ex)
            {
                // Best effort per entry: report the failure and carry on with the next entry.
                _log.LogInfo("Failed to extract entry " + entry.FullName + ": " + ex.Message);
                return(null);
            }
        }