Exemple #1
0
        // Runs MuPDFRenderer.GetDocumentMetaInfo() on the LDA paper fixture PDF, checks the
        // reported page count, corruption flag and raw text size, approves the re-indented JSON
        // output and finally verifies that ClearRawContent() resets both raw content members.
        public void TestGetDocumentMetaInfo_on_doc1()
        {
            string pdf_path = MiscTestHelpers.GetNormalizedPathToAnyTestDataTestFile(@"fixtures/1.Doc-Many.Metadata.Formats/0001-LDA-paper/2004.04.PNAS.ef997ae1b01762b57b75d8c22fb8cec87406.pdf");
            ASSERT.FileExists(pdf_path);

            PDFDocumentMuPDFMetaInfo meta = MuPDFRenderer.GetDocumentMetaInfo(pdf_path, null, ProcessPriorityClass.Normal);

            ASSERT.AreEqual<int>(8, meta.PageCount);
            ASSERT.AreEqual<bool>(false, meta.DocumentIsCorrupted);
            // NOTE(review): the argument order suggests this checks `10000 <= text length` -- confirm against the ASSERT helper.
            ASSERT.IsLessOrEqual(10000, meta.raw_multipurp_text.Length);
            TestJSONoutputIsCorrectForPDFdoc1(meta.raw_decoded_json);

            // Round-trip the raw text through Json.NET so the approved file is indented, then
            // normalize the line endings to LF.
            object parsed_json = JsonConvert.DeserializeObject(meta.raw_multipurp_text);
            string indented_json = JsonConvert.SerializeObject(parsed_json, Formatting.Indented);
            string approval_text = indented_json.Replace("\r\n", "\n");

            // Compare against the approved output through ApprovalTests, using the custom
            // QiqqaApprover in place of ApprovalTests.Approvals.VerifyJson().
            QiqqaApprover approver = new QiqqaApprover(approval_text, pdf_path);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());

            // After clearing, neither raw member may hold on to its content.
            meta.ClearRawContent();
            ASSERT.IsNull(meta.raw_multipurp_text);
            ASSERT.IsNull(meta.raw_decoded_json);
        }
        // Imports the given RIS test file: splits its text into per-record line sets, maps every
        // line set to a RISRecord plus its BibTeX conversion, and approves the whole result as JSON.
        //
        // ris_filepath: path of the RIS test file; it is resolved via GetNormalizedPathToRISTestFile()
        // and also passed to the QiqqaApprover.
        public void Basic_Import_Test(string ris_filepath)
        {
            string resolved_path = GetNormalizedPathToRISTestFile(ris_filepath);
            ASSERT.FileExists(resolved_path);

            string ris_content = GetTestFileContent(resolved_path);

            Result outcome = new Result();
            outcome.lines_set = SplitMultipleRISLines(ris_content);

            foreach (List<string> record_lines in outcome.lines_set)
            {
                RISRecord ris_record = MapRISLinesToDictionary(record_lines);

                outcome.records.Add(ris_record);
                outcome.bibtex_items.Add(ris_record.ToBibTeX());
            }

            // Serialize to indented JSON with LF line endings so the approval diff is easy to read.
            string indented_json = JsonConvert.SerializeObject(outcome, Newtonsoft.Json.Formatting.Indented);
            string approval_text = indented_json.Replace("\r\n", "\n");

            // Custom approver instead of ApprovalTests.Approvals.VerifyJson().
            QiqqaApprover approver = new QiqqaApprover(approval_text, ris_filepath);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());
        }
Exemple #3
0
        // Rough throughput measurement for BibTexCharacterMap.ASCIIToBibTex() and
        // BibTexCharacterMap.BibTexToASCII() on the content of the given test file.
        //
        // Each conversion is repeated up to MAX_ROUNDS times; the stopwatch is only consulted once
        // every CLOCK_CHECK_INTERVAL rounds and the loop stops early once TIME_BUDGET_MS has passed.
        // The resulting figures are logged; nothing is asserted about them.
        public void Test_SPEED(string filepath)
        {
            const int MAX_ROUNDS = 1000000;
            const int CLOCK_CHECK_INTERVAL = 10000;
            const int TIME_BUDGET_MS = 2000;

            string path = GetNormalizedPathToAnyTestDataTestFile(filepath);
            ASSERT.FileExists(path);

            string s1 = GetTestFileContent(path);
            string s2 = BibTexCharacterMap.ASCIIToBibTex(s1);
            string s3 = BibTexCharacterMap.BibTexToASCII(s2);

            // --- ASCII -> BibTeX ---
            Stopwatch clock = Stopwatch.StartNew();
            int rounds = 0;
            while (rounds < MAX_ROUNDS)
            {
                // only look at the clock on the last round of every chunk
                bool is_chunk_end = (rounds % CLOCK_CHECK_INTERVAL == CLOCK_CHECK_INTERVAL - 1);
                if (is_chunk_end && clock.ElapsedMilliseconds >= TIME_BUDGET_MS)
                {
                    break;
                }
                s2 = BibTexCharacterMap.ASCIIToBibTex(s1);
                rounds++;
            }
            // (rounds * characters) per millisecond, scaled by 1.0E-3
            double time_a2b = rounds * 1.0E-3 * s1.Length / clock.ElapsedMilliseconds;

            Logging.Info("ASCIIToBibTex can do {0:0.000}M operations per second per character", time_a2b);

            // --- BibTeX -> ASCII ---
            clock = Stopwatch.StartNew();
            rounds = 0;
            while (rounds < MAX_ROUNDS)
            {
                bool is_chunk_end = (rounds % CLOCK_CHECK_INTERVAL == CLOCK_CHECK_INTERVAL - 1);
                if (is_chunk_end && clock.ElapsedMilliseconds >= TIME_BUDGET_MS)
                {
                    break;
                }
                s3 = BibTexCharacterMap.BibTexToASCII(s2);
                rounds++;
            }
            // note: the scale factor uses the length of the original input (s1), not of s2
            double time_b2a = rounds * 1.0E-3 * s1.Length / clock.ElapsedMilliseconds;

            Logging.Info("BibTexToASCII can do {0:0.000}M operations per second per character", time_b2a);

            // dummy call which consumes the last conversion result
            BibTexCharacterMap.ASCIIToBibTex(s3);
        }
Exemple #4
0
        // Loads the formatted `pdf-info1` JSON snippet fixture, deserializes it into a list of
        // MultiPurpDocumentInfoObject items and hands that list to
        // TestJSONoutputIsCorrectForPDFdoc1() for the actual content checks.
        public void TestMuPDF_multipurp_JSON_formatted_snippet1_parses_okay()
        {
            string snippet_path = MiscTestHelpers.GetNormalizedPathToAnyTestDataTestFile(@"fixtures/mutool/multipurp/json-snippets/pdf-info1-formatted.json");
            ASSERT.FileExists(snippet_path);

            string snippet_text = File.ReadAllText(snippet_path);
            ASSERT.IsNotNull(snippet_text);

            List<MultiPurpDocumentInfoObject> parsed_infos = JsonConvert.DeserializeObject<List<MultiPurpDocumentInfoObject>>(snippet_text);
            TestJSONoutputIsCorrectForPDFdoc1(parsed_infos);
        }
Exemple #5
0
        // Deserializes the given serialization test file into a PDFDocument.
        //
        // input_path: path of the serialized record test file; it is resolved via
        // GetNormalizedPathToSerializationTestFile().
        //
        // The test fails when the file is missing, when deserialization throws, or when
        // deserialization produces no record at all (null).
        public void Deserialize_v80_ClassicalQiqqaRecord(string input_path)
        {
            string path = GetNormalizedPathToSerializationTestFile(input_path);

            ASSERT.FileExists(path);

            string input = GetTestFileContent(path);

            PDFDocument record = JsonConvert.DeserializeObject<PDFDocument>(input);

            // Check the actual outcome: the previous `ASSERT.IsTrue(true)` calls could never fail,
            // so a deserialization that silently produced nothing went unnoticed.
            ASSERT.IsNotNull(record);
        }
Exemple #6
0
        // Converts the content of the given test file with BibTexCharacterMap.ASCIIToBibTex()
        // and approves the converted text through ApprovalTests.
        //
        // filepath: path of the test data file; it is resolved via
        // GetNormalizedPathToAnyTestDataTestFile() and also passed to the QiqqaApprover.
        public void Test_Conversion_To_BibTeX_Text(string filepath)
        {
            string resolved_path = GetNormalizedPathToAnyTestDataTestFile(filepath);
            ASSERT.FileExists(resolved_path);

            string ascii_text = GetTestFileContent(resolved_path);
            string bibtex_text = BibTexCharacterMap.ASCIIToBibTex(ascii_text);

            QiqqaApprover approver = new QiqqaApprover(bibtex_text, filepath);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());
        }
Exemple #7
0
        // Extracts the MuPDF meta info for one PDF below `fixtures/PDF/` and approves its JSON
        // representation, as produced by ProduceJSONtext4Comparison().
        //
        // filepath: PDF path relative to `fixtures/PDF/`; every "./" occurrence is stripped from it
        // before the fixture path is composed.
        public void Test_PDF_metadata_extraction_via_multipurp_chunk0070_qpdf(string filepath)
        {
            string relative_path = filepath.Replace("./", "");
            string pdf_path = MiscTestHelpers.GetNormalizedPathToAnyTestDataTestFile($"fixtures/PDF/{relative_path}");
            ASSERT.FileExists(pdf_path);

            PDFDocumentMuPDFMetaInfo meta = MuPDFRenderer.GetDocumentMetaInfo(pdf_path, null, ProcessPriorityClass.Normal);
            string approval_text = ProduceJSONtext4Comparison(meta);

            // Compare against the approved output through ApprovalTests, using the custom
            // QiqqaApprover in place of ApprovalTests.Approvals.VerifyJson().
            QiqqaApprover approver = new QiqqaApprover(approval_text, pdf_path);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());
        }
Exemple #8
0
        // Scrapes a stored search result sample page and approves the scraped papers as JSON.
        //
        // sample_filepath: path of the sample file; it is resolved via
        // GetNormalizedPathToAnyTestDataTestFile() and also passed to the QiqqaApprover.
        //
        // The text inside the first HTML comment (`<!-- ... -->`) of the sample is used as the URL
        // and everything following that comment is used as the HTML document to scrape.
        // The test fails when the file is missing, when the sample holds no such comment, or when
        // scraping yields no papers at all.
        public void Basic_Test(string sample_filepath)
        {
            string path = GetNormalizedPathToAnyTestDataTestFile(sample_filepath);

            ASSERT.FileExists(path);

            string sample_text = GetTestFileContent(path);

            // extract URL from sample file:
            Match match = Regex.Match(sample_text, @"<!--(.*?)-->(.*)", RegexOptions.Singleline); // counter-intuitive flag: https://stackoverflow.com/questions/159118/how-do-i-match-any-character-across-multiple-lines-in-a-regular-expression

            // Use the documented Success property rather than a reference comparison against
            // Match.Empty, and fail right here with a clear assertion when the sample is malformed
            // instead of feeding a null URL and a null document to the scraper below.
            ASSERT.IsTrue(match.Success);

            string url     = match.Groups[1].Value.Trim();
            string docHtml = match.Groups[2].Value;

            List <GoogleScholarScrapePaper> gssps = new List <GoogleScholarScrapePaper>();

            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(docHtml);

            ScrapeDoc(doc, url, ref gssps);

            ASSERT.IsGreaterOrEqual(gssps.Count, 1);

            // Serialize the result to indented JSON with LF line endings for easier comparison via ApprovalTests
            string json_out = JsonConvert.SerializeObject(gssps, Newtonsoft.Json.Formatting.Indented).Replace("\r\n", "\n");

            // Custom approver instead of ApprovalTests.Approvals.VerifyJson(json_out):
            ApprovalTests.Approvals.Verify(
                new QiqqaApprover(json_out, sample_filepath),
                ApprovalTests.Approvals.GetReporter()
                );
        }
        // Feeds the content of the given test file to StringTools.GetFirstWord() and approves
        // both the input and the returned value as one JSON document.
        //
        // data_filepath: path of the test data file; it is resolved via
        // GetNormalizedPathToAnyTestDataTestFile() and also passed to the QiqqaApprover.
        public void GetFirstWord_Test(string data_filepath)
        {
            string resolved_path = GetNormalizedPathToAnyTestDataTestFile(data_filepath);
            ASSERT.FileExists(resolved_path);

            string content = GetTestFileContent(resolved_path);

            Result outcome = new Result
            {
                input  = content,
                result = StringTools.GetFirstWord(content)
            };

            // Serialize to indented JSON with LF line endings so the approval diff is easy to read.
            string indented_json = JsonConvert.SerializeObject(outcome, Newtonsoft.Json.Formatting.Indented);
            string approval_text = indented_json.Replace("\r\n", "\n");

            // Custom approver instead of ApprovalTests.Approvals.VerifyJson().
            QiqqaApprover approver = new QiqqaApprover(approval_text, data_filepath);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());
        }
Exemple #10
0
        // Converts the given PubMed XML test file with PubMedXMLToBibTex.TryConvert() and approves
        // the success flag, the BibTeX output and the conversion messages as one JSON document.
        //
        // pubmed_filepath: path of the PubMed XML test file; it is resolved via
        // GetNormalizedPathToPubMedXMLTestFile() and also passed to the QiqqaApprover.
        public void Basic_Import_Test(string pubmed_filepath)
        {
            // See http://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html for the low-down

            string resolved_path = GetNormalizedPathToPubMedXMLTestFile(pubmed_filepath);
            ASSERT.FileExists(resolved_path);

            string xml_content = GetTestFileContent(resolved_path);

            // TryConvert() writes its outputs straight into the result object.
            Result outcome = new Result();
            outcome.success = PubMedXMLToBibTex.TryConvert(xml_content, out outcome.bibtex, out outcome.messages);

            // Serialize to indented JSON with LF line endings so the approval diff is easy to read.
            string indented_json = JsonConvert.SerializeObject(outcome, Newtonsoft.Json.Formatting.Indented);
            string approval_text = indented_json.Replace("\r\n", "\n");

            // Custom approver instead of ApprovalTests.Approvals.VerifyJson().
            QiqqaApprover approver = new QiqqaApprover(approval_text, pubmed_filepath);
            ApprovalTests.Approvals.Verify(approver, ApprovalTests.Approvals.GetReporter());
        }