virtual public void test()
        {
            // Extracts the text inside one fixed region of page 1 of "user10.pdf"
            // and checks it against the expected string.
            PdfReader pdf = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "user10.pdf");

            // The y coordinates of the region are written as offsets from 792.
            RenderFilter regionOnly     = new RegionTextRenderFilter(new Rectangle(71, 792 - 84, 225, 792 - 75));
            var          regionListener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);

            String extracted = PdfTextExtractor.GetTextFromPage(pdf, 1, regionListener);

            Assert.AreEqual("Pname Dname Email Address", extracted);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the text that falls inside <paramref name="column"/> on the given page
        /// and hands it to <c>SplitColumnTextIntoRows</c>.
        /// </summary>
        /// <param name="column">Region of the page to extract text from.</param>
        /// <param name="reader">Reader for the PDF being processed.</param>
        /// <param name="page">Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <returns>The result of <c>SplitColumnTextIntoRows</c> for the extracted text.</returns>
        private string[] ExtractCurrentColumnFromPage(Rectangle column, PdfReader reader, int page)
        {
            // Only text located inside the column rectangle reaches the location-based strategy.
            RenderFilter[] columnOnly     = { new RegionTextRenderFilter(column) };
            var            columnListener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), columnOnly);

            var columnText = PdfTextExtractor.GetTextFromPage(reader, page, columnListener);

            return SplitColumnTextIntoRows(columnText);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Extracts the text found inside <paramref name="rectangle"/> on page 1 of a PDF.
        /// </summary>
        /// <param name="file">Value passed to the <c>PdfReader</c> constructor (the PDF to open).</param>
        /// <param name="rectangle">Region of the page whose text is returned.</param>
        /// <returns>The text reported by <c>PdfTextExtractor.GetTextFromPage</c> for that region.</returns>
        public string GetStringValueFromRegion(string file, iTextSharp.text.Rectangle rectangle)
        {
            var reader = new PdfReader(file);

            try
            {
                var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rectangle) };
                ITextExtractionStrategy textExtractionStrategy =
                    new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, 1, textExtractionStrategy);
            }
            finally
            {
                // Release the reader even when extraction throws.
                reader.Close();
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds a text-extraction strategy that only keeps text inside a rectangular region.
        /// </summary>
        /// <param name="pixelDistanceFromLeft">First argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="pixelDistanceFromBottom">Second argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="pixelDistanceWidth">Third argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="pixelDistanceHeight">Fourth argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <returns>
        /// A <c>FilteredTextRenderListener</c> wrapping a new <c>LocationTextExtractionStrategy</c>
        /// and a single region filter.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the parameter names say "pixel"; confirm which unit callers actually pass.
        /// </remarks>
        public static ITextExtractionStrategy MakeRectangle(float pixelDistanceFromLeft, float pixelDistanceFromBottom, float pixelDistanceWidth, float pixelDistanceHeight)
        {
            var region = new System.util.RectangleJ(pixelDistanceFromLeft, pixelDistanceFromBottom, pixelDistanceWidth, pixelDistanceHeight);

            RenderFilter[] regionOnly = { new RegionTextRenderFilter(region) };

            return new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Returns the text of one page that lies inside the rectangle described by its
        /// lower-left (<paramref name="llx"/>, <paramref name="lly"/>) and upper-right
        /// (<paramref name="urx"/>, <paramref name="ury"/>) corners.
        /// </summary>
        /// <param name="reader">Reader for the PDF being processed.</param>
        /// <param name="pageNum">Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <returns>The extracted text.</returns>
        private static string GetColumnText(PdfReader reader, int pageNum, float llx, float lly, float urx, float ury)
        {
            // Reminder: coordinates are in points, and 1 in = 2.54 cm = 72 points.
            RenderFilter[] regionOnly     = { new RegionTextRenderFilter(new iTextSharp.text.Rectangle(llx, lly, urx, ury)) };
            var            regionListener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);

            return PdfTextExtractor.GetTextFromPage(reader, pageNum, regionListener);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Runs <c>TextWithFontExtractionStategy</c> over every page of a hard-coded PDF file.
        /// The extracted text is not used after extraction.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string strFile = @"D:\yoo\张甜甜-Java开发.pdf";

            PdfReader reader = new PdfReader(strFile);

            try
            {
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    ITextExtractionStrategy strategy = new TextWithFontExtractionStategy();

                    // NOTE(review): the result is discarded; presumably this is scratch code — confirm.
                    PdfTextExtractor.GetTextFromPage(reader, i, strategy);
                }
            }
            finally
            {
                // Release the reader even when a page fails to parse.
                reader.Close();
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Extracts the text of two rectangular regions (A and B) from page 1 of <c>reader</c>
        /// and stores the results in <c>textA</c> and <c>textB</c>.
        /// </summary>
        private void ReadText()
        {
            var regionA = new iTextSharp.text.Rectangle(coordinate1a, coordinate2a, coordinate3a, coordinate4a);
            var regionB = new iTextSharp.text.Rectangle(coordinate1b, coordinate2b, coordinate3b, coordinate4b);

            // One single-region filter per rectangle.
            RenderFilter onlyA = new RegionTextRenderFilter(regionA);
            RenderFilter onlyB = new RegionTextRenderFilter(regionB);

            // Each region gets its own strategy instance.
            ITextExtractionStrategy strategyA = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), onlyA);
            ITextExtractionStrategy strategyB = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), onlyB);

            textA = PdfTextExtractor.GetTextFromPage(reader, 1, strategyA);
            textB = PdfTextExtractor.GetTextFromPage(reader, 1, strategyB);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reads two fixed regions from every page of the source PDF and writes one
        /// tab-separated line per page to <c>extractedIDs.txt</c>.
        /// </summary>
        /// <remarks>
        /// The first region supplies the ID. A counter restarts at 1 whenever the ID differs from
        /// the ID of the previously processed page. Only the first line of the second region's
        /// text is written. A page that throws is reported as "failed" and processing continues.
        /// The reader and the output file are released even if an exception escapes.
        /// </remarks>
        public static void ExtractIDs()
        {
            PdfReader reader = new PdfReader($@"{jobDir}\PDF Extraction\temp\{sourcePDF}");

            try
            {
                using (FileStream fs = new FileStream($@"{jobDir}\PDF Extraction\temp\extractedIDs.txt", FileMode.Create))
                using (StreamWriter sw = new StreamWriter(fs))
                {
                    string previousVal = "";
                    int    count       = 0;

                    sw.WriteLine("Index\tID\tPageCounter\tPageNumber\tFileName");

                    for (int i = 1; i <= reader.NumberOfPages; i++)
                    {
                        try
                        {
                            // Region holding the ID.
                            Rectangle               area     = new Rectangle(414, 660, 522, 689);
                            RenderFilter            filter   = new RegionTextRenderFilter(area);
                            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                            string currentVal = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                            // Second region; only its first line ends up in the output.
                            Rectangle    area2   = new Rectangle(465, 565, 555, 635);
                            RenderFilter filter2 = new RegionTextRenderFilter(area2);
                            strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter2);
                            string pages     = PdfTextExtractor.GetTextFromPage(reader, i, strategy);
                            string firstLine = pages.Split('\n')[0];

                            // Restart the running count when the ID changes.
                            if (previousVal != currentVal)
                            {
                                count = 0;
                            }
                            count++;
                            previousVal = currentVal;

                            sw.WriteLine($"{i}\t{currentVal}\t{firstLine}\t{count}\t{currentVal}-{count}");
                        }
                        catch (Exception)
                        {
                            // Best effort: record the failure and keep going with the next page.
                            sw.WriteLine($"{i}\tfailed");
                        }
                    }

                    sw.Flush();
                }
            }
            finally
            {
                reader.Close();
            }
        }
        /// <summary>
        /// Checks that one pass over page 1 of "test.pdf" with a
        /// <c>MultiFilteredRenderListener</c> yields the expected text for each of three regions.
        /// </summary>
        virtual public void Test()
        {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");

            // Expected text per region, in the same order as the regions below.
            String[] expected = {
                "PostScript Compatibility",
                "Because the PostScript language does not support the transparent imaging \n" +
                "model, PDF 1.4 consumer applications must have some means for converting the \n" +
                "appearance of a document that uses transparency to a purely opaque description \n" +
                "for printing on PostScript output devices. Similar techniques can also be used to \n" +
                "convert such documents to a form that can be correctly viewed by PDF 1.3 and \n" +
                "earlier consumers. ",
                "Otherwise, flatten the colors to some assumed device color space with pre-\n" +
                "determined calibration. In the generated PostScript output, paint the flattened \n" +
                "colors in a CIE-based color space having that calibration. "
            };

            Rectangle[] areas = {
                new Rectangle(90, 605, 220, 581),
                new Rectangle(80, 578, 450, 486),
                new Rectangle(103, 196, 460, 143)
            };

            // Attach one location-based strategy per region to a single shared listener.
            MultiFilteredRenderListener      multiListener = new MultiFilteredRenderListener();
            LocationTextExtractionStrategy[] perRegion     = new LocationTextExtractionStrategy[areas.Length];
            for (int k = 0; k < areas.Length; k++)
            {
                RegionTextRenderFilter onlyThisArea = new RegionTextRenderFilter(areas[k]);
                perRegion[k] =
                    (LocationTextExtractionStrategy)
                    multiListener.AttachRenderListener(new LocationTextExtractionStrategy(), onlyThisArea);
            }

            // The page content is processed once; every attached strategy collects its own region.
            new PdfReaderContentParser(pdfReader).ProcessContent(1, multiListener);

            for (int k = 0; k < areas.Length; k++)
            {
                Assert.AreEqual(expected[k], perRegion[k].GetResultantText());
            }
        }
        /// <summary>
        /// Extracts text from a PDF file (only selectable text content of the pages, no OCR of images).
        /// </summary>
        /// <param name="filepath">Input file path.</param>
        /// <param name="zone">
        /// Zone to extract text from, in millimetres, with Y measured downwards from the top of the page.
        /// If null, the full page is processed.
        /// </param>
        /// <param name="pages">
        /// Page numbers to extract. If null, empty, or the first item is 0, all pages are extracted.
        /// Numbers below 1 or above the document's page count are skipped.
        /// </param>
        /// <returns>
        /// A list of strings, one per extracted page, with "\n" replaced by <see cref="Environment.NewLine"/>.
        /// </returns>
        public static List<string> GetPdfTextFromPages(string filepath, RectangleF? zone = null, List<int> pages = null)
        {
            using (PdfReader reader = new PdfReader(filepath))
            {
                List<string> result = new List<string>();

                // Null, empty, or a leading 0 all mean "read every page".
                if (pages == null || pages.Count == 0 || pages[0] == 0)
                {
                    pages = Enumerable.Range(1, reader.NumberOfPages).ToList();
                }

                foreach (int pageNumber in pages)
                {
                    // Ignore page numbers that do not exist in this document.
                    if (pageNumber < 1 || pageNumber > reader.NumberOfPages)
                    {
                        continue;
                    }

                    string text;

                    if (zone.HasValue)
                    {
                        // Zone-based extraction: convert millimetres to points first.
                        float x = Utilities.MillimetersToPoints(zone.Value.X);
                        float y = Utilities.MillimetersToPoints(zone.Value.Y);
                        float w = Utilities.MillimetersToPoints(zone.Value.Width);
                        float h = Utilities.MillimetersToPoints(zone.Value.Height);

                        // Translate the top-based zone into page coordinates using the page's top edge.
                        var pagesize = reader.GetPageSizeWithRotation(pageNumber);
                        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(x, pagesize.Top - y, x + w, pagesize.Top - y - h);

                        RenderFilter[] renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        text = PdfTextExtractor.GetTextFromPage(reader, pageNumber, textExtractionStrategy);
                    }
                    else
                    {
                        // Full-page extraction.
                        text = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
                    }

                    result.Add(text.Replace("\n", Environment.NewLine));
                }

                return result;
            }
        }
Exemplo n.º 11
0
    /// <summary>
    /// Reads page 1 of the downloaded PDF cell by cell (6 rows by 8 columns) and appends the
    /// text of each processed cell to <c>nakedData</c>.
    /// </summary>
    /// <param name="www">Request whose <c>downloadHandler.data</c> holds the PDF bytes.</param>
    /// <remarks>
    /// Also stores a copy of the downloaded bytes in <c>bytes</c>. While a cell is being
    /// re-read, <c>Bdeltax</c> is doubled and afterwards halved again.
    /// NOTE(review): only the MemoryStream is closed at the end; the PdfReader is not.
    /// </remarks>
    void TakeDataFromPdf(UnityWebRequest www)
    {
        MemoryStream mm = new MemoryStream(www.downloadHandler.data);

        // Keep a copy of the raw PDF bytes in the field.
        bytes = mm.ToArray();
        PdfReader reader = new PdfReader(mm);
        ITextExtractionStrategy strategy;

        // q selects the row (it only affects the y coordinates), j the column (x coordinates).
        for (int q = 0; q < 6; q++)
        {
            for (int j = 0; j < 8; j++)
            {
                // Cell rectangle: left/bottom step by Adeltax/Adeltay, right/top by Bdeltax/Bdeltay.
                Rectangle    rect   = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * Bdeltax), 130 + (q * Bdeltay));
                RenderFilter filter = new RegionTextRenderFilter(rect);
                string       s;

                strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                s        = "";
                s        = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                // Step 1: when IsLaba accepts the text, re-read the cell with a wider rectangle.
                // NOTE(review): IsLaba is defined elsewhere; presumably it detects an entry spanning two columns — confirm.

                if (IsLaba(s))
                {
                    float microBdeltaX = 0;

                    // For j == 0 the term j * Bdeltax is 0, so doubling Bdeltax would not move the
                    // right edge; add one (undoubled) Bdeltax explicitly instead.
                    if (j == 0)
                    {
                        microBdeltaX = Bdeltax;
                    }

                    // Temporarily double the right-edge step, then extract again with the wider rectangle.
                    Bdeltax *= 2;
                    rect     = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * Bdeltax) + microBdeltaX, 130 + (q * Bdeltay));
                    filter   = new RegionTextRenderFilter(rect);
                    strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                    s        = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                    // Skip the next column and undo the doubling.
                    j++;
                    Bdeltax /= 2;
                }
                nakedData.Add(s);
            }
        }

        mm.Close();
    }
Exemplo n.º 12
0
        /// <summary>
        /// Writes to the console, for every page of <c>pdfReader</c>, the text found inside a
        /// fixed rectangle.
        /// </summary>
        /// <returns>Always <c>null</c>; the extracted text is only written to the console.</returns>
        public string ExtractByCoordinate()
        {
            Rectangle    rectangle = new Rectangle(320, 785 - 250, 368, 799 - 250);
            RenderFilter filter    = new RegionTextRenderFilter(rectangle);

            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                // A strategy keeps every text chunk it has received, so a new one is created for
                // each page; reusing one instance would repeat the text of earlier pages.
                ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

                Console.WriteLine(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }

            return null;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Extracts the text inside a rectangle given in UI coordinates, after scaling it to
        /// the size returned by <c>PageSize()</c>.
        /// </summary>
        /// <param name="UIWith">Width of the UI surface the points refer to.</param>
        /// <param name="UIHeight">Height of the UI surface the points refer to.</param>
        /// <param name="ll">First corner of the rectangle, in UI coordinates.</param>
        /// <param name="ur">Opposite corner of the rectangle, in UI coordinates.</param>
        /// <param name="page">Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <returns>The text extracted from the scaled rectangle.</returns>
        public string ExtractData(float UIWith, float UIHeight, Point ll, Point ur,
                                  int page = 1)
        {
            Console.WriteLine("Test");

            // Scale factors from UI units to page units.
            float scaleX = PageSize().Width / UIWith;
            float scaleY = PageSize().Height / UIHeight;

            // The y axis is flipped: each UI y is subtracted from the UI height before scaling.
            var x1 = ll.X * scaleX;
            var y1 = (UIHeight - ll.Y) * scaleY;
            var x2 = ur.X * scaleX;
            var y2 = (UIHeight - ur.Y) * scaleY;

            RenderFilter            regionOnly     = new RegionTextRenderFilter(new Rectangle(x1, y1, x2, y2));
            ITextExtractionStrategy regionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionOnly);

            return PdfTextExtractor.GetTextFromPage(PdfReader, page, regionStrategy);
        }
Exemplo n.º 14
0
 /// <summary>
 /// Returns the text found in a fixed region of page 1 of the given PDF.
 /// </summary>
 /// <param name="fileName">Value passed to the <c>PdfReader</c> constructor (the PDF to open).</param>
 /// <returns>The extracted text, or "unreadable" if any exception occurs.</returns>
 public static string ReadID(string fileName)
 {
     PdfReader reader = null;

     try
     {
         reader = new PdfReader(fileName);

         Rectangle               area     = new Rectangle(414, 660, 522, 689);
         RenderFilter            filter   = new RegionTextRenderFilter(area);
         ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

         return PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
     }
     catch (Exception)
     {
         // Best effort: any failure to open or parse the file is reported as "unreadable".
         return "unreadable";
     }
     finally
     {
         // Release the reader on both the success and the failure path.
         if (reader != null)
         {
             reader.Close();
         }
     }
 }
Exemplo n.º 15
0
        /// <summary>
        /// Extracts the text inside a rectangular area of one page of a PDF file.
        /// </summary>
        /// <param name="filename">Value passed to the <c>PdfReader</c> constructor (the PDF to open).</param>
        /// <param name="page">Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <param name="x">First argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="y">Second argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="width">Third argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <param name="height">Fourth argument passed to the <c>RectangleJ</c> constructor.</param>
        /// <returns>The extracted text, unchanged.</returns>
        public string GetTextInArea(string filename, int page, int x, int y, int width, int height)
        {
            using (PdfReader pdfReader = new PdfReader(filename))
            {
                RenderFilter filter   = new RegionTextRenderFilter(new System.util.RectangleJ(x, y, width, height));
                var          strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

                // The extractor already returns a .NET string, so no re-encoding is needed.
                // Passing it through Encoding.Default bytes and back could only replace
                // characters that the default code page cannot represent.
                return PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
            }
        }
Exemplo n.º 16
0
        /// <summary>
        /// Extracts the text inside a rectangle on one page of a PDF file.
        /// </summary>
        /// <param name="filepath">Value passed to the <c>PdfReader</c> constructor (the PDF to open).</param>
        /// <param name="pageno">Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <param name="cordinate1">First x coordinate of the rectangle.</param>
        /// <param name="coordinate2">First y coordinate of the rectangle.</param>
        /// <param name="coordinate3">Second x coordinate of the rectangle.</param>
        /// <param name="coordinate4">Second y coordinate of the rectangle.</param>
        /// <param name="filter">
        /// When <c>true</c> the y coordinates are used as given; when <c>false</c> each y
        /// coordinate is replaced by <c>1000 - y</c>.
        /// </param>
        /// <returns>The extracted text.</returns>
        public string getParagraphByCoOrdinate(string filepath, int pageno, int cordinate1, int coordinate2, int coordinate3, int coordinate4, bool filter)
        {
            PdfReader reader = new PdfReader(filepath);

            try
            {
                // The two modes differ only in how the y coordinates are interpreted.
                int y1 = filter ? coordinate2 : 1000 - coordinate2;
                int y2 = filter ? coordinate4 : 1000 - coordinate4;

                iTextSharp.text.Rectangle rect         = new iTextSharp.text.Rectangle(cordinate1, y1, coordinate3, y2);
                RenderFilter[]            renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

                return PdfTextExtractor.GetTextFromPage(reader, pageno, textExtractionStrategy);
            }
            finally
            {
                // Release the reader even when extraction throws.
                reader.Close();
            }
        }
Exemplo n.º 17
0
    /// <summary>
    /// Reports whether the text of one cell on page 1 of the stored PDF contains
    /// "семинар" or "лекции".
    /// </summary>
    /// <param name="x">Value handed to <c>EnterManager.instance.checkNumberOfLesson</c>, which yields the cell's column and row.</param>
    /// <returns><c>true</c> if the cell text contains either word; otherwise <c>false</c>.</returns>
    public bool TakeSomeCellOfData(int x)
    {
        // checkNumberOfLesson returns the column and reports the row through the ref argument.
        int row    = 0;
        int column = EnterManager.instance.checkNumberOfLesson(x, ref row);

        PdfReader pdf = new PdfReader(new MemoryStream(bytes));

        // Column offsets apply to the x coordinates, row offsets to the y coordinates.
        Rectangle cellArea = new Rectangle(45 + (column * Adeltax), 40 + (row * Adeltay), 138 + (column * Bdeltax), 130 + (row * Bdeltay));

        RenderFilter            onlyCell     = new RegionTextRenderFilter(cellArea);
        ITextExtractionStrategy cellStrategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), onlyCell);
        string                  cellText     = PdfTextExtractor.GetTextFromPage(pdf, 1, cellStrategy);

        return cellText.Contains("семинар") || cellText.Contains("лекции");
    }
Exemplo n.º 18
0
        /// <summary>
        /// Dumps the text of every page of a PDF to <paramref name="outFileName"/> and writes one
        /// semicolon-separated summary line per page to a second file named like
        /// <paramref name="outFileName"/> with its file name prefixed by "filtered ".
        /// </summary>
        /// <param name="fileName">Value passed to the <c>PdfReader</c> constructor (the PDF to read).</param>
        /// <param name="outFileName">Path of the full-text output file (UTF-8, overwritten).</param>
        /// <remarks>
        /// Each summary line holds, in order: the "Matricule … SS …" pair found by regex in the
        /// page text (if any), an empty field, the text of a fixed region, the text between two
        /// "Net imposable" occurrences (if any), and the text of a second fixed region.
        /// Both writers and the reader are released even if an exception occurs.
        /// </remarks>
        public static void ExtractText(string fileName, string outFileName)
        {
            // Prefix only the file-name part so that outFileName may include a directory.
            string filteredFileName = Path.Combine(
                Path.GetDirectoryName(outFileName) ?? string.Empty,
                "filtered " + Path.GetFileName(outFileName));

            using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
            using (StreamWriter outFile2 = new StreamWriter(filteredFileName, false, System.Text.Encoding.UTF8))
            {
                PdfReader pdfReader = new PdfReader(fileName);

                try
                {
                    for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                    {
                        // Full page text goes to the first output file unchanged.
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
                        outFile.Write(currentPageText);

                        // Field 1: identifiers located by regex in the full page text.
                        var match = Regex.Match(currentPageText, "Matricule : *([0-9]*)  SS : ([0-9]*)");
                        if (match.Success)
                        {
                            outFile2.Write("Matricule : " + match.Groups[1].Value + "  SS : " + match.Groups[2].Value);
                        }
                        outFile2.Write(";");

                        // Field 3 (field 2 is left empty): the same identifiers read from a fixed region.
                        // NOTE(review): the top edge uses 848 while the bottom edge uses 842 — confirm this is intended.
                        iTextSharp.text.Rectangle rect         = new iTextSharp.text.Rectangle(400, 842 - 128, 480, 848 - 110);
                        RenderFilter[]            renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        string idLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                        outFile2.Write(";");
                        outFile2.Write(idLocation);
                        outFile2.Write(";");

                        // Field 4: text between two "Net imposable" occurrences, located by regex.
                        match = Regex.Match(currentPageText, "Net imposable(.*)Net imposable");
                        if (match.Success)
                        {
                            outFile2.Write(match.Groups[1].Value);
                        }

                        // Field 5: the same amount read from a fixed region.
                        // Author's notes on sample data: the regex and region values differ on page 259
                        // (adjustment for an earlier period, no net taxable amount for the period), and
                        // the region is empty on page 56 (normal, the payslip spans several pages).
                        rect                   = new iTextSharp.text.Rectangle(150, 240, 210, 250);
                        renderFilter           = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                        textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        string netLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);

                        outFile2.Write(";");
                        outFile2.Write(netLocation);
                        outFile2.WriteLine();
                    }
                }
                finally
                {
                    pdfReader.Close();
                }
            }
        }
Exemplo n.º 19
0
        private void Scrape(CancellationToken token, string path)
        {
            PdfReader reader = new PdfReader(path);
            string    sUri   = "";

            try
            {
                // Pagination
                for (var i = 1; i <= reader.NumberOfPages; i++)
                {
                    //get current page
                    var pageDict = reader.GetPageN(i);

                    //get all annotations from current page
                    var annotArray = (PdfArray)PdfReader.GetPdfObject(pageDict.Get(PdfName.ANNOTS));

                    //ensure array isn't empty
                    if (annotArray == null)
                    {
                        continue;
                    }
                    if (annotArray.Length <= 0)
                    {
                        continue;
                    }

                    // check every annotation on the page
                    foreach (var annot in annotArray.ArrayList)
                    {
                        //convert the iTextSharp-specific object to a generic PDF object
                        var annotDict = (PdfDictionary)PdfReader.GetPdfObject(annot);

                        //ensure the object isnt empty
                        if (annotDict == null)
                        {
                            continue;
                        }

                        //get the annotation subtype and ensure it is a link
                        var subtype = annotDict.Get(PdfName.SUBTYPE).ToString();
                        Log("Subtype: " + subtype);
                        if (subtype != "/Link")
                        {
                            continue;
                        }


                        //get the annotations ACTION
                        var linkDict = (PdfDictionary)annotDict.GetDirectObject(PdfName.A);
                        if (linkDict == null)
                        {
                            continue;
                        }

                        if (!linkDict.Get(PdfName.S).Equals(PdfName.GOTO))
                        {
                            //get the link from the annotation
                            sUri = linkDict.Get(PdfName.URI).ToString();
                            Log("URI: " + sUri);
                            if (String.IsNullOrEmpty(sUri))
                            {
                                continue;
                            }
                        }


                        //build the link address into a string
                        string linkTextBuilder;

                        //create a rectangle, define its paramteres, read the text under the rectangle (the anchor text for the link) and write it to the string
                        var                       LinkLocation   = annotDict.GetAsArray(PdfName.RECT);
                        List <string>             linestringlist = new List <string>();
                        iTextSharp.text.Rectangle rect           = new iTextSharp.text.Rectangle(((PdfNumber)LinkLocation[0]).FloatValue, ((PdfNumber)LinkLocation[1]).FloatValue, ((PdfNumber)LinkLocation[2]).FloatValue, ((PdfNumber)LinkLocation[3]).FloatValue);
                        RenderFilter[]            renderFilter   = new RenderFilter[1];
                        renderFilter[0] = new RegionTextRenderFilter(rect);
                        ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                        linkTextBuilder = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy).Trim();
                        Log(linkTextBuilder);

                        String sUriHTTPSPrefix = sUri.Substring(0, 5);
                        String sUriWWWPrefix   = sUri.Substring(0, 3);

                        if (sUriHTTPSPrefix.Equals("https"))
                        {
                            Log("Prefix: " + sUriHTTPSPrefix);
                        }
                        else if (sUriWWWPrefix.Equals("www"))
                        {
                            Log("Prefix: " + sUriWWWPrefix);
                        }


                        if (sUri.Length.Equals(70) && sUriHTTPSPrefix.Equals("https"))
                        {
                            //instantiate the web request and response objects
                            WebRequest  httpReq  = WebRequest.Create(sUri);
                            WebResponse response = httpReq.GetResponse();

                            //check website response from request and save status to string
                            string webStatus = ((HttpWebResponse)response).StatusDescription.ToString();
                            Log("Webstatus: " + webStatus);

                            //check website response url from request and save url to string
                            string responseURL = ((HttpWebResponse)response).ResponseUri.ToString();
                            Log("Response URI: " + responseURL);

                            //split the response url string to just sku number after the "="
                            string webSite = responseURL.Split('=')[1];
                            Log("Response SKU: " + webSite);

                            //split the link harvested from the annotations in the pdf after the "="
                            string sku = sUri.Split('=')[1];
                            Log("PDF SKU: " + sku);

                            //truncate the split string to just the sku (removes any extra symbols, such as copywright, which were captured in the rectangle)
                            string finalSku = sku.Substring(0, 7);
                            Log("PDF SKU Final: " + finalSku);

                            //delete asteriks from sku
                            var deleteChars = new string[] { "*" };
                            foreach (var c in deleteChars)
                            {
                                linkTextBuilder = finalSku.Replace(c, string.Empty);
                            }

                            //truncate the split string to just the sku (removes any extra symbols, such as copywright, which were captured in the rectangle)
                            string linkText = linkTextBuilder.Substring(0, 7);
                            Log("Link SKU Final: " + linkText);

                            //default status of string match is "NO MATCH"
                            string match = "\tNO MATCH";

                            //make a blank IPEndPoint
                            IPEndPoint remoteEP = null;

                            //create a new httpwebrequest
                            HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(sUri);

                            //bind IPEndPoint from the remote end point to a variable
                            req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
                            {
                                remoteEP = remoteEndPoint;
                                return(null);
                            };

                            //get the response from the request
                            req.GetResponse();
                            Log("HTTP response: " + req.GetResponse());

                            //set remoteEndPoint to string for display
                            string hostIP = remoteEP.Address.ToString();
                            Log("Host IP: " + hostIP);

                            //stop the request to make way for a new request on next iteration
                            req.Abort();

                            //if sku from PDF link, website response link, and pdf anchor text all match then change match
                            if (linkText.Equals(sku) && webSite.Equals(sku))
                            {
                                match = "MATCH";
                            }

                            //add data to datagridview
                            this.skuGrid.Rows.Add(i, sku, linkTextBuilder, webSite, match, webStatus, hostIP);
                            //close http request and response
                            response.Close();

                            if (token.IsCancellationRequested)
                            {
                                // Clean up here, then...
                                token.ThrowIfCancellationRequested();
                                Log("Cancellation token: " + token);
                                MessageBox.Show("Scrape stopped");
                                throbber.Hide();
                                skuGrid.Show();
                            }
                        }
                        else if (!sUri.Length.Equals(70) && sUriWWWPrefix.Equals("www") ||
                                 !sUri.Length.Equals(70) && sUriHTTPSPrefix.Equals("https"))
                        {
                            //instantiate the web request and response objects
                            WebRequest  httpReq  = WebRequest.Create(sUri);
                            WebResponse response = httpReq.GetResponse();

                            //check website response from request and save status to string
                            string webStatus = ((HttpWebResponse)response).StatusDescription.ToString();
                            Log("Web Status: " + webStatus);

                            //check website response url from request and save url to string
                            string responseURL = ((HttpWebResponse)response).ResponseUri.ToString();
                            Log("Response URI: " + responseURL);

                            //use the full response url as-is for comparison (no sku is split out in this branch)
                            string webSite = responseURL;
                            Log("Response SKU: " + webSite);

                            //default status of string match is "NO MATCH"
                            string match = "\tNO MATCH";

                            //make a blank IPEndPoint
                            IPEndPoint remoteEP = null;

                            //create a new httpwebrequest
                            HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(sUri);

                            //bind IPEndPoint from the remote end point to a variable
                            req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
                            {
                                remoteEP = remoteEndPoint;
                                return(null);
                            };

                            //get the response from the request
                            req.GetResponse();
                            Log("HTTP Response: " + req.GetResponse());

                            //set remoteEndPoint to string for display
                            string hostIP = remoteEP.Address.ToString();
                            Log("Host IP: " + hostIP);

                            //stop the request to make way for a new request on next iteration
                            req.Abort();

                            //delete asterisks from sku
                            var deleteChars = new string[] { "*" };
                            foreach (var c in deleteChars)
                            {
                                linkTextBuilder = linkTextBuilder.Replace(c, string.Empty);
                            }

                            //if the pdf anchor text matches the website response url then change match
                            if (linkTextBuilder.Equals(webSite))
                            {
                                match = "MATCH";
                            }

                            //add data to datagridview
                            this.skuGrid.Rows.Add(i, null, linkTextBuilder, webSite, match, webStatus, hostIP);
                            //close http request and response
                            response.Close();

                            if (token.IsCancellationRequested)
                            {
                                // Clean up here, then...
                                token.ThrowIfCancellationRequested();
                                Log("Cancellation token: " + token);
                                MessageBox.Show("Scrape stopped");
                                throbber.Hide();
                                skuGrid.Show();
                            }
                        }
                    }
                }
                MessageBox.Show("Scrape complete");
                throbber.Hide();
                skuGrid.Show();
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message);
                Log("EXCEPTION ERROR: " + ex + " " + ex.HelpLink);
                throbber.Hide();
                skuGrid.Show();
            }
            finally
            {
                taskIsRunning = false;

                //update grid
                skuGrid.Update();

                //close PDF reader
                reader.Close();

                WriteLog(log);
            }
        }