/// <summary>
/// Initializes the parser for one section file: stores the supplied settings, derives the section,
/// table and display names from the file name, then calls <c>LoadHtml()</c> and <c>ProcessHTML()</c>.
/// </summary>
/// <param name="logger">Logger stored in <c>Logger</c>.</param>
/// <param name="fileInfo">File to parse; its <c>File_Name</c> (without extension) becomes the section name.</param>
/// <param name="isPreservation">When true, the main table name is prefixed with <paramref name="preservationPrefix"/> and an underscore.</param>
/// <param name="preservationPrefix">Prefix stored in <c>PreservationPrefix</c> and used for preservation table names.</param>
/// <param name="version">Parser version stored in <c>ParserVersion</c>.</param>
public SectionParser(CommandLineLogger logger, ExtractFileInfo fileInfo, bool isPreservation, string preservationPrefix, ParserVersionEnum version)
        {
            ContainsLocationData = false;
            Logger      = logger;
            FileInfo    = fileInfo;
            SourceFile  = FileInfo.OriginalFile_Path;
            SectionName = Path.GetFileNameWithoutExtension(FileInfo.File_Name);

            // Split on underscores. A name without any underscore comes back as a single word, and a
            // name made only of underscores (or an empty name) falls back to the raw section name.
            List<string> sectionWords = SectionName.Split(new string[] { "_" }, StringSplitOptions.RemoveEmptyEntries).ToList();
            if (!sectionWords.Any())
            {
                sectionWords.Add(SectionName);
            }

            // Capitalize the first letter of every word. The invariant culture keeps the generated
            // table name identical on every machine (e.g. "index" must not become "İndex" under tr-TR).
            List<string> capitalizedWords = new List<string>();
            foreach (string word in sectionWords)
            {
                capitalizedWords.Add(string.IsNullOrEmpty(word)
                                     ? word
                                     : char.ToUpperInvariant(word[0]) + word.Substring(1));
            }

            IsPreservation     = isPreservation;
            PreservationPrefix = preservationPrefix;
            ParserVersion      = version;

            MainTableName = string.Join("", capitalizedWords);
            if (isPreservation)
            {
                MainTableName = PreservationPrefix + "_" + MainTableName;
            }
            DisplaySectionName = string.Join(" ", capitalizedWords);

            LoadHtml();
            ProcessHTML();
        }
 /// <summary>
 /// Creates a <c>VideosParser</c>. All arguments are forwarded unchanged to the base constructor;
 /// this constructor adds no behavior of its own.
 /// </summary>
 public VideosParser(
     CommandLineLogger logger,
     ExtractFileInfo   fileInfo,
     bool              isPreservation,
     string            preservationPrefix,
     ParserVersionEnum version)
     : base(logger, fileInfo, isPreservation, preservationPrefix, version)
 {
     // Intentionally empty: the base constructor does all the work.
 }
 /// <summary>
 /// Creates an <c>AboutMeParser</c>. After the base constructor has run, a preservation extract
 /// additionally gets one comparison query assigned to <c>PreservationQueries</c>; the query joins
 /// the preservation table against the table whose name has the "prefix_" text removed.
 /// </summary>
 public AboutMeParser(CommandLineLogger logger, ExtractFileInfo fileInfo, bool isPreservation, string preservationPrefix, ParserVersionEnum version)
     : base(logger, fileInfo, isPreservation, preservationPrefix, version)
 {
     // Nothing extra to register for a non-preservation extract.
     if (!isPreservation)
     {
         return;
     }

     // {0} is the preservation table, {1} the same name with every "<prefix>_" occurrence stripped.
     string preservationTable = MainTableName;
     string currentTable      = MainTableName.Replace(preservationPrefix + "_", "");
     string comparisonSql     = string.Format(
                 @"SELECT 
                         (CASE WHEN a.[AboutMe] IS NULL THEN 'true' ELSE 'false' END) as [MissingInCurrent],
                         p.[AboutMe] as [P_AboutMe], 
                         a.[AboutMe],
                         (case when LTRIM(RTRIM(p.[AboutMe])) <> LTRIM(RTRIM(a.[AboutMe])) THEN 'true' ELSE 'false' END) AS [AboutMe_Changed],
                         a.[File]     
                     FROM {0} p 
                     LEFT JOIN {1} a ON a.[AboutMe] = p.[AboutMe]", MainTableName, currentTable);

     PreservationQueries = new List <PreservationQuery>
     {
         new PreservationQuery()
         {
             PreservationTableName = preservationTable,
             QueryText             = comparisonSql
         }
     };
 }
 /// <summary>
 /// Creates an <c>NcmecReportsParser</c>. The arguments are forwarded to the base constructor, after
 /// which this constructor always throws.
 /// </summary>
 /// <exception cref="MissingTestDataException">Always thrown, carrying <c>DisplaySectionName</c>.</exception>
 public NcmecReportsParser(
     CommandLineLogger logger,
     ExtractFileInfo   fileInfo,
     bool              isPreservation,
     string            preservationPrefix,
     ParserVersionEnum version)
     : base(logger, fileInfo, isPreservation, preservationPrefix, version)
 {
     // Construction never completes: the section name is reported through the exception.
     throw new MissingTestDataException(DisplaySectionName);
 }
        /// <summary>
        /// Processes one extract archive end to end: unzips it, parses the queued HTML files, records the
        /// source files, deletes the temporary files, and restores the "save-" copies of the original
        /// HTML files. Progress and timings are written to a logger created for this run.
        /// </summary>
        /// <param name="extractZipFileNameAndPath">
        /// Path of the extract zip. Its file name without extension is used as the holder folder / case name.
        /// </param>
        /// <returns>Always an empty string.</returns>
        public string ParseInstagramHTMLExtract(string extractZipFileNameAndPath)
        {
            Stopwatch totalProcessingTime = Stopwatch.StartNew();

            bool              hasPreservation = false;
            ParserVersionEnum version         = ParserVersionEnum.One;
            string            holderFolder    = Path.GetFileNameWithoutExtension(extractZipFileNameAndPath);
            CommandLineLogger logger          = new CommandLineLogger(holderFolder, _defaultDirectory);

            logger.LogInfo("Begin extracting files.");
            Stopwatch stopWatch = Stopwatch.StartNew();
            UnzipFiles(logger, holderFolder, _defaultDirectory, null, extractZipFileNameAndPath, ref version);
            stopWatch.Stop();
            logger.LogInfo("Extract files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");

            // NOTE(review): holderFolder already has its extension removed; stripping it again shortens
            // names that contain a dot. extractPath is only used to trim log messages below - confirm intent.
            string holderPath  = AppDomain.CurrentDomain.BaseDirectory + _defaultDirectory;
            string extractPath = holderPath + Path.GetFileNameWithoutExtension(holderFolder);

            ParseHTML(logger, version, holderFolder, ref hasPreservation);

            if (_allFiles.Count > 0)
            {
                DataAccess.AddSourceFiles(_defaultDirectory, _allFiles, holderFolder);
            }

            // TODO: Need to revisit this in the future to determine what analytics are required or wanted by the user base.
            ////compare preservation to current tables if exists
            //if (hasPreservation)
            //{
            //    _log.LogInfo("Begin comparing preservation files.");
            //    DifPreservationTables(holderFolder);
            //}

            if (_tempFilesToRemove.Any())
            {
                logger.LogInfo("Begin removing temporary files.");
                stopWatch = Stopwatch.StartNew();

                // Delete every temporary file and remember (once each) the directories they lived in.
                List<string> tempPathsToRemove = new List<string>();
                foreach (ExtractFileInfo tempFile in _tempFilesToRemove)
                {
                    logger.LogInfo("Removing file: " + tempFile.File_Path.Replace(extractPath + "\\", ""));
                    File.Delete(tempFile.File_Path);

                    string tempDirectory = Path.GetDirectoryName(tempFile.File_Path);
                    if (!tempPathsToRemove.Contains(tempDirectory))
                    {
                        tempPathsToRemove.Add(tempDirectory);
                    }
                }

                foreach (string pathToRemove in tempPathsToRemove)
                {
                    // Directory.Delete(path) fails on anything that is not completely empty, so look at
                    // sub-directories as well as files before removing the folder.
                    if (Directory.Exists(pathToRemove) && !Directory.EnumerateFileSystemEntries(pathToRemove).Any())
                    {
                        Directory.Delete(pathToRemove);
                    }
                }

                stopWatch.Stop();
                logger.LogInfo("Removing temporary files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");
            }

            // Put back any original HTML file that no longer exists, then drop its "save-" backup.
            foreach (ExtractFileInfo file in _filesToPreserve)
            {
                string preservedFileName = Path.GetDirectoryName(file.File_Path) + "\\save-" + file.File_Name;
                if (!File.Exists(preservedFileName))
                {
                    continue;
                }

                if (!File.Exists(file.File_Path))
                {
                    File.Copy(preservedFileName, file.File_Path);
                }
                File.Delete(preservedFileName);
            }

            totalProcessingTime.Stop();
            logger.LogInfoAlert("Processing Complete.  Total Processing time: " + totalProcessingTime.Elapsed.GetFormattedElapsedTime());
            return("");
        }
        /// <summary>
        /// Parses every file queued in <c>_htmlToParse</c>: builds a <c>ParserManager</c> per file, dispatches
        /// to the section parser selected by the file name, collects location data and preservation
        /// queries, and finally writes the accumulated location data for the case.
        /// </summary>
        /// <param name="logger">Logger used for progress, warnings and errors.</param>
        /// <param name="version">Parser version handed to each <c>ParserManager</c>.</param>
        /// <param name="caseName">Case name handed to each <c>ParserManager</c> and to <c>WriteLocationData</c>.</param>
        /// <param name="hasPreservation">
        /// Set to true as soon as one file path contains "preservation"; never reset to false here.
        /// </param>
        private void ParseHTML(CommandLineLogger logger, ParserVersionEnum version, string caseName, ref bool hasPreservation)
        {
            Stopwatch stopWatch = new Stopwatch();

            logger.LogInfo("Begin parsing files.");
            stopWatch.Start();
            foreach (ExtractFileInfo htmlx in _htmlToParse)
            {
                ParserManager parser = new ParserManager(logger)
                {
                    CaseNumber     = caseName,
                    HtmlToLoad     = htmlx.File_Path,
                    HtmlToRecord   = htmlx.IsTemporary ? htmlx.ParentFile_Path : htmlx.File_Path,
                    IsPreservation = false,
                    DatabasePreservationNoPrefix = string.Empty,
                    Version          = version,
                    DefaultDirectory = _defaultDirectory
                };

                logger.LogInfo("Processing " + htmlx.File_Path);
                try
                {
                    // All name matching below is ordinal / invariant: culture-sensitive casing breaks the
                    // comparisons on some locales (e.g. Turkish "i"/"I"), which would silently skip sections.
                    if (htmlx.File_Path.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        hasPreservation = true;
                        /*Preservation-1, Preservation-2, Preservation-3  */
                        parser.IsPreservation = true;
                        DirectoryInfo di     = new DirectoryInfo(htmlx.File_Path);
                        DirectoryInfo folder = di.Parent;   //goes up to parent directory, preservation
                        string        p      = folder.Name;
                        if (p.IndexOf("preservation", StringComparison.OrdinalIgnoreCase) < 0 && folder.Parent != null)
                        {
                            p = folder.Parent.Name;                                //goes up to parent directory, preservation\folderX\index.html
                        }
                        parser.DatabasePreservationNoPrefix = p.Replace("-", "_"); //sqllite doesn't like sql queries ref tables with a '-', change to '_'
                    }

                    if (htmlx.File_Name.IndexOf("PRESERVATION", StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        switch (htmlx.File_Name.ToUpperInvariant().Trim())
                        {
                        case "ABOUT_ME.HTML":
                            parser.AboutMeParse(htmlx);
                            break;

                        case "ACCOUNT_STATUS_HISTORY.HTML":
                            parser.AccountStatusHistoryParse(htmlx);
                            break;

                        case "COMMENTS.HTML":
                            parser.CommentsParse(htmlx);
                            break;

                        case "DEVICES.HTML":
                            parser.DevicesParse(htmlx);
                            break;

                        case "DIRECT_SHARES.HTML":
                            parser.DirectSharesParse(htmlx);
                            break;

                        case "DIRECT_STORIES.HTML":
                            parser.DirectStoriesParse(htmlx);
                            break;

                        case "FOLLOWERS.HTML":
                            parser.FollowersParse(htmlx);
                            break;

                        case "FOLLOWING.HTML":
                            parser.FollowingParse(htmlx);
                            break;

                        case "GENDER.HTML":
                            parser.GenderParse(htmlx);
                            break;

                        case "INCOMING_FOLLOW_REQUESTS.HTML":
                            parser.IncomingFollowRequestsParse(htmlx);
                            break;

                        case "INDEX.HTML":
                            parser.IndexParse(htmlx);
                            break;

                        case "LIKES.HTML":
                            parser.LikesParse(htmlx);
                            break;

                        case "LINKED_ACCOUNTS.HTML":
                            parser.LinkedAccountsParse(htmlx);
                            break;

                        case "LIVE_VIDEOS.HTML":
                            parser.LiveVideosParse(htmlx);
                            break;

                        case "UNIFIED_MESSAGES.HTML":
                            parser.UnifiedMessagesParse(htmlx);
                            break;

                        case "NAME_CHANGES.HTML":
                            parser.NameChangesParse(htmlx);
                            break;

                        case "NCMEC_REPORTS.HTML":
                            parser.NcmecReportsParse(htmlx);
                            break;

                        case "PHOTOS.HTML":
                            parser.PhotosParse(htmlx);
                            break;

                        case "POPULAR_BLOCK.HTML":
                            parser.PopularBlockParse(htmlx);
                            break;

                        case "PRIVACY_SETTINGS.HTML":
                            parser.PrivacySettingsParse(htmlx);
                            break;

                        case "PROFILE_PICTURE.HTML":
                            parser.ProfilePictureParse(htmlx);
                            break;

                        case "VANITY_CHANGES.HTML":
                            parser.VanityChangesParse(htmlx);
                            break;

                        case "VIDEOS.HTML":
                            parser.VideosParse(htmlx);
                            break;

                        case "WEBSITE.HTML":
                            parser.WebsiteParse(htmlx);
                            break;

                        default:
                            logger.LogWarning("Unknown Section - \"Unknown section:" + htmlx.File_Name + "\".  Please contact NDCAC with section name and test data to improve parsing functionality");
#if DEBUG
                            // Debug builds fail fast so an unhandled section is noticed during development.
                            throw new ApplicationException(htmlx.File_Name);
#endif
                            break;
                        }
                    }
                    if (parser.LocationData != null && parser.LocationData.Any())
                    {
                        _locationData.AddRange(parser.LocationData);
                    }
                    AddSectionToPreservationList(htmlx.File_Name, parser.PreservationQueries);
                }
                catch (SectionEmptyException ex)
                {
                    logger.LogWarning("Parsing " + ex.Message + " section complete - section contains no data: Excluding from database.");
                }
                catch (MissingTestDataException ex)
                {
                    logger.LogWarning("Parsing " + ex.Message + " section skipped - parser not implemented: No test data available.");
                }
                catch (NotImplementedException ex)
                {
                    logger.LogError("Parsing " + ex.Message + " section failed: parser not implemented.", ex);
                }
            }
            WriteLocationData(caseName);
            stopWatch.Stop();
            logger.LogInfo("Parsing files complete (Time: " + stopWatch.Elapsed.GetFormattedElapsedTime() + ")... ");
        }
        /// <summary>
        /// Extracts one archive and queues its contents: HTML files are backed up as "save-" copies and
        /// queued for parsing, generated HTML files and nested zips are registered as temporary files,
        /// and every nested zip is then extracted recursively.
        /// </summary>
        /// <param name="logger">Logger handed to the <c>ZipUtility</c>.</param>
        /// <param name="holderFolder">Case name assigned to the <c>ZipUtility</c>.</param>
        /// <param name="defaultDirectory">Directory argument passed to <c>ExtractZip</c>.</param>
        /// <param name="parentZipFile">Path of the archive that contained <paramref name="toExtract"/>, or null for the outermost archive.</param>
        /// <param name="toExtract">Path of the archive to extract.</param>
        /// <param name="version">Set to <c>ParserVersionEnum.Two</c> when <c>GenerateParsedHtml</c> returns any file.</param>
        private void UnzipFiles(CommandLineLogger logger, string holderFolder, string defaultDirectory, string parentZipFile, string toExtract, ref ParserVersionEnum version)
        {
            ZipUtility zp = new ZipUtility(logger)
            {
                CaseName          = holderFolder,
                ParentZipFilePath = parentZipFile
            };

            // Ordinal, case-insensitive prefix tests: culture-sensitive upper-casing mis-handles "i"
            // on some locales (e.g. tr-TR), which would misclassify index/preservation files.
            Func<ExtractFileInfo, bool> isIndexFile        = x => x.File_Name.Trim().StartsWith("INDEX.", StringComparison.OrdinalIgnoreCase);
            Func<ExtractFileInfo, bool> isPreservationFile = x => x.File_Name.Trim().StartsWith("PRESERVATION", StringComparison.OrdinalIgnoreCase);

            // Each result is snapshotted once with ToList(): the sequences are walked several times below,
            // and a lazily evaluated source would otherwise repeat its work on every pass.
            IEnumerable<ExtractFileInfo> htmlFiles = zp.ExtractZip(defaultDirectory, toExtract, ".html").ToList();

            if (htmlFiles.Any())
            {
                foreach (ExtractFileInfo file in htmlFiles)
                {
                    // Keep a backup copy next to the original; it is restored after processing.
                    File.Copy(file.File_Path, Path.GetDirectoryName(file.File_Path) + "\\save-" + file.File_Name);
                    _filesToPreserve.Add(file);
                }

                _htmlToParse.AddRange(htmlFiles.Where(x => !isIndexFile(x) && !isPreservationFile(x)));

                IEnumerable<ExtractFileInfo> generatedHtmlFiles = zp.GenerateParsedHtml(htmlFiles).ToList();
                if (generatedHtmlFiles.Any())
                {
                    version = ParserVersionEnum.Two;
                    foreach (ExtractFileInfo generatedHtmlFile in generatedHtmlFiles)
                    {
                        // Skip a generated file that is already queued, either under its own path or
                        // under "<path>\<file name without extension>".
                        string generatedPath = generatedHtmlFile.File_Path;
                        string nestedPath    = generatedPath + @"\" + Path.GetFileNameWithoutExtension(generatedPath);
                        bool   alreadyQueued = _htmlToParse.Any(x => x.File_Path.Equals(generatedPath) || x.File_Path.Equals(nestedPath));
                        if (!alreadyQueued)
                        {
                            _htmlToParse.Add(generatedHtmlFile);
                        }
                    }
                    _tempFilesToRemove.AddRange(generatedHtmlFiles);
                }
                else
                {
                    // Nothing was generated: parse the index files directly.
                    _htmlToParse.AddRange(htmlFiles.Where(isIndexFile));
                }
                SaveFiles(htmlFiles);
            }

            IEnumerable<ExtractFileInfo> zipFiles = zp.ExtractZip(defaultDirectory, toExtract, ".zip").ToList();

            if (zipFiles.Any())
            {
                SaveFiles(zipFiles);
                _tempFilesToRemove.AddRange(zipFiles);
            }

            IEnumerable<ExtractFileInfo> otherFiles = zp.ExtractZip(defaultDirectory, toExtract, "").ToList();

            if (otherFiles.Any())
            {
                SaveFiles(otherFiles);
            }

            // Recurse into nested archives, recording the current archive as their parent.
            foreach (ExtractFileInfo z in zipFiles)
            {
                UnzipFiles(logger, holderFolder, defaultDirectory, toExtract, z.File_Path, ref version);
            }
        }
Ejemplo n.º 8
0
 /// <summary>
 /// Creates an <c>AccountStatusHistoryParser</c>. All arguments are forwarded unchanged to the base
 /// constructor; this constructor adds no behavior of its own.
 /// </summary>
 public AccountStatusHistoryParser(
     CommandLineLogger logger,
     ExtractFileInfo   fileInfo,
     bool              isPreservation,
     string            preservationPrefix,
     ParserVersionEnum version)
     : base(logger, fileInfo, isPreservation, preservationPrefix, version)
 {
     // Intentionally empty: the base constructor does all the work.
 }