Exemple #1
0
        public void BuildRecordList_WithData()
        {
            // Arrange: word-length -> occurrence counts (total occurrences = 14)
            var testData = new Dictionary<uint, uint>
            {
                { 4, 8 },
                { 1, 2 },
                { 2, 4 },
            };

            // Records come back ordered by word length, with gaps filled by zero-count entries
            var expected = new List<TagDataFrequencyRecord>
            {
                new TagDataFrequencyRecord(1, 2, 2 / 14.0),
                new TagDataFrequencyRecord(2, 4, 4 / 14.0),
                new TagDataFrequencyRecord(3, 0, 0.0),
                new TagDataFrequencyRecord(4, 8, 8 / 14.0),
            };

            // Act
            List<TagDataFrequencyRecord> actual = TagDataFrequencyRecord.BuildRecordList(testData).ToList();

            // Assert
            Assert.AreEqual(expected, actual);
        }
Exemple #2
0
 public void Constructor_ThrowsArgumentException_OnInvalidArgs(uint wordLength, uint count, double relativeFrequencyInReport)
 {
     // Any invalid argument combination supplied by the test cases must be rejected at construction
     Assert.Throws<ArgumentException>(() => new TagDataFrequencyRecord(wordLength, count, relativeFrequencyInReport));
 }
Exemple #3
0
        public void BuildRecordList_Empty()
        {
            // An empty input dictionary must produce an empty record sequence
            var emptyCounts = new Dictionary<uint, uint>();

            IEnumerable<TagDataFrequencyRecord> records = TagDataFrequencyRecord.BuildRecordList(emptyCounts);

            Assert.AreEqual(Enumerable.Empty<TagDataFrequencyRecord>(), records);
        }
Exemple #4
0
        /// <summary>
        /// Writes each part of the report content separately by calling the relevant GetStreamForX methods in turn
        /// </summary>
        /// <param name="jobInfo">Info for the completed extraction job being reported on</param>
        private void WriteSplitReport(CompletedExtractJobInfo jobInfo)
        {
            // TODO(rkm 2020-10-29) We can probably reduce the number of full collection enumerations in this method

            // -- README / summary part: job header, file listing, blocked files, anonymisation failures --
            using (Stream stream = GetStreamForSummary(jobInfo))
            {
                using StreamWriter streamWriter = GetStreamWriter(stream);
                foreach (string line in JobHeader(jobInfo))
                {
                    streamWriter.WriteLine(line);
                }

                streamWriter.WriteLine();
                streamWriter.WriteLine("Files included:");
                streamWriter.WriteLine("-   README.md (this file)");
                streamWriter.WriteLine("-   pixel_data_summary.csv");
                streamWriter.WriteLine("-   pixel_data_full.csv");
                streamWriter.WriteLine("-   pixel_data_word_length_frequencies.csv");
                streamWriter.WriteLine("-   tag_data_summary.csv");
                streamWriter.WriteLine("-   tag_data_full.csv");
                streamWriter.WriteLine();
                streamWriter.WriteLine("This file contents:");
                streamWriter.WriteLine("-   Blocked files");
                streamWriter.WriteLine("-   Anonymisation failures");

                streamWriter.WriteLine();
                streamWriter.WriteLine("## Blocked files");
                streamWriter.WriteLine();

                // Rejections ordered by total rejected item count, largest first
                IOrderedEnumerable<ExtractionIdentifierRejectionInfo> orderedRejections = _jobStore
                    .GetCompletedJobRejections(jobInfo.ExtractionJobIdentifier)
                    .OrderByDescending(x => x.RejectionItems.Sum(y => y.Value));
                foreach (ExtractionIdentifierRejectionInfo extractionIdentifierRejectionInfo in orderedRejections)
                {
                    WriteJobRejections(streamWriter, extractionIdentifierRejectionInfo);
                }

                streamWriter.WriteLine();
                streamWriter.WriteLine("## Anonymisation failures");
                streamWriter.WriteLine();
                foreach (FileAnonFailureInfo fileAnonFailureInfo in _jobStore.GetCompletedJobAnonymisationFailures(
                             jobInfo.ExtractionJobIdentifier))
                {
                    WriteAnonFailure(streamWriter, fileAnonFailureInfo);
                }

                streamWriter.WriteLine();
                streamWriter.WriteLine("--- end of report ---");

                streamWriter.Flush();
                FinishReportPart(stream);
            }

            // Local helper function to write a header row plus records to one CSV report part
            void WriteCsv<T>(Stream stream, IEnumerable<T> records) where T : IExtractionReportCsvRecord
            {
                using StreamWriter streamWriter = GetStreamWriter(stream);
                using var csvWriter = new CsvWriter(streamWriter, _csvConfiguration);

                csvWriter.WriteHeader<T>();
                csvWriter.NextRecord();

                csvWriter.WriteRecords(records);

                streamWriter.Flush();
                FinishReportPart(stream);
            }

            // All validation failures for this job, grouped by tag name, then by failure value
            Dictionary<string, Dictionary<string, List<string>>> groupedFailures = GetJobVerificationFailures(jobInfo.ExtractionJobIdentifier);

            // First deal with the pixel data
            Dictionary<string, List<string>> pixelFailures = groupedFailures.GetValueOrDefault(PixelDataStr);

            if (pixelFailures is null)
            {
                Logger.Info($"No {PixelDataStr} failures found for the extraction job");
                pixelFailures = new Dictionary<string, List<string>>();
            }

            // Create records for the pixel reports, accumulating word-length frequencies as we go
            List<TagDataSummaryCsvRecord> pixelSummaryRecords = TagDataSummaryCsvRecord.BuildRecordList(PixelDataStr, pixelFailures).ToList();
            var wordLengthCounts = new Dictionary<uint, uint>();

            foreach (TagDataSummaryCsvRecord tagDataSummaryCsvRecord in pixelSummaryRecords)
            {
                var wordLen = (uint)tagDataSummaryCsvRecord.FailureValue.Length;

                // Single-lookup accumulate (was ContainsKey + Add + indexer read-modify-write, i.e. three hash lookups)
                wordLengthCounts.TryGetValue(wordLen, out uint lengthCount);
                wordLengthCounts[wordLen] = lengthCount + (uint)tagDataSummaryCsvRecord.Occurrences;

                // For pixel data the report contains only this one tag, so the two frequencies coincide
                tagDataSummaryCsvRecord.RelativeFrequencyInReport = tagDataSummaryCsvRecord.RelativeFrequencyInTag;
            }

            // Write summary pixel CSV — longest failure values first, then most frequent
            using (Stream stream = GetStreamForPixelDataSummary(jobInfo))
                WriteCsv(
                    stream,
                    pixelSummaryRecords
                        .OrderByDescending(x => x.FailureValue.Length)
                        .ThenByDescending(x => x.Occurrences)
                );

            // Write full pixel CSV
            using (Stream stream = GetStreamForPixelDataFull(jobInfo))
                WriteCsv(
                    stream,
                    TagDataFullCsvRecord
                        .BuildRecordList(PixelDataStr, pixelFailures)
                        .OrderByDescending(x => x.FailureValue.Length)
                );

            // Write the pixel text frequency file
            using (Stream stream = GetStreamForPixelDataWordLengthFrequencies(jobInfo))
                WriteCsv(
                    stream,
                    TagDataFrequencyRecord.BuildRecordList(wordLengthCounts)
                );

            // Now select all other (non-pixel) tags
            Dictionary<string, Dictionary<string, List<string>>> otherTagFailures =
                groupedFailures
                    .Where(x => x.Key != PixelDataStr)
                    .ToDictionary(x => x.Key, x => x.Value);

            // Write the summary CSV for all other tags. Before doing so, we need to convert into
            // records and calculate the relative frequencies across the whole report
            var summaryRecordsByTag = new List<List<TagDataSummaryCsvRecord>>();
            var totalOccurrencesByValue = new Dictionary<string, uint>();

            foreach ((string tagName, Dictionary<string, List<string>> failures) in otherTagFailures)
            {
                List<TagDataSummaryCsvRecord> recordsForTag = TagDataSummaryCsvRecord.BuildRecordList(tagName, failures).ToList();
                summaryRecordsByTag.Add(recordsForTag);
                foreach (TagDataSummaryCsvRecord r in recordsForTag)
                {
                    // Single-lookup accumulate (was ContainsKey + two indexer accesses)
                    totalOccurrencesByValue.TryGetValue(r.FailureValue, out uint occurrences);
                    totalOccurrencesByValue[r.FailureValue] = occurrences + r.Occurrences;
                }
            }

            var totalFailureValues = (uint)summaryRecordsByTag.Sum(x => x.Sum(y => y.Occurrences));
            var orderedTagSummaryRecords = new List<TagDataSummaryCsvRecord>();

            // Tags with the most occurrences first, and within each tag the most frequent values first.
            // NOTE(review): if records exist but all have zero occurrences, totalFailureValues is 0 and the
            // division below yields NaN — presumably BuildRecordList never emits such records; verify
            foreach (List<TagDataSummaryCsvRecord> tagRecordList in summaryRecordsByTag.OrderByDescending(x =>
                                                                                                           x.Sum(y => y.Occurrences)))
            {
                foreach (TagDataSummaryCsvRecord record in tagRecordList.OrderByDescending(x => x.Occurrences))
                {
                    record.RelativeFrequencyInReport = totalOccurrencesByValue[record.FailureValue] * 1.0 / totalFailureValues;
                    orderedTagSummaryRecords.Add(record);
                }
            }

            using (Stream stream = GetStreamForTagDataSummary(jobInfo))
                WriteCsv(
                    stream,
                    orderedTagSummaryRecords
                );

            // Write the full csv for all other tags — ordered by tag name ascending, then failure value descending
            var fullRecordsByTag = new List<List<TagDataFullCsvRecord>>();

            foreach ((string tagName, Dictionary<string, List<string>> failures) in otherTagFailures)
            {
                fullRecordsByTag.Add(TagDataFullCsvRecord.BuildRecordList(tagName, failures).ToList());
            }

            var orderedFullTagRecords = new List<TagDataFullCsvRecord>();

            // NOTE(review): x[0] assumes BuildRecordList never returns an empty list for a tag — verify
            foreach (List<TagDataFullCsvRecord> tagRecordSet in fullRecordsByTag.OrderBy(x => x[0].TagName))
            {
                foreach (TagDataFullCsvRecord record in tagRecordSet.OrderByDescending(x => x.FailureValue))
                {
                    orderedFullTagRecords.Add(record);
                }
            }

            using (Stream stream = GetStreamForTagDataFull(jobInfo))
                WriteCsv(
                    stream,
                    orderedFullTagRecords
                );
        }