/// <summary>
/// Extracts the text inside a fixed rectangle on page 1 of "user10.pdf" using a
/// region filter and checks it against the expected string.
/// </summary>
public virtual void test()
{
    PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "user10.pdf");
    try
    {
        // The Y values are written as "792 - n", i.e. relative to a value of 792.
        Rectangle rectangle = new Rectangle(71, 792 - 84, 225, 792 - 75);
        RenderFilter filter = new RegionTextRenderFilter(rectangle);

        String txt = PdfTextExtractor.GetTextFromPage(
            reader,
            1,
            new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter));

        Assert.AreEqual("Pname Dname Email Address", txt);
    }
    finally
    {
        // Release the reader even when the assertion fails.
        reader.Close();
    }
}
/// <summary>
/// Extracts the text located inside <paramref name="column"/> on the given page and
/// passes it to <c>SplitColumnTextIntoRows</c>.
/// </summary>
/// <param name="column">Region of the page to read.</param>
/// <param name="reader">Open reader for the document.</param>
/// <param name="page">Page number handed to the extractor.</param>
/// <returns>Whatever <c>SplitColumnTextIntoRows</c> returns for the extracted text.</returns>
private string[] ExtractCurrentColumnFromPage(Rectangle column, PdfReader reader, int page)
{
    // Only text rendered inside the column reaches the location-based strategy.
    RenderFilter[] columnFilters = { new RegionTextRenderFilter(column) };
    var columnListener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), columnFilters);

    string columnText = PdfTextExtractor.GetTextFromPage(reader, page, columnListener);
    return SplitColumnTextIntoRows(columnText);
}
/// <summary>
/// Opens the PDF at <paramref name="file"/> and returns the text found inside
/// <paramref name="rectangle"/> on page 1.
/// </summary>
/// <param name="file">Path of the PDF file to open.</param>
/// <param name="rectangle">Region of page 1 to extract text from.</param>
/// <returns>The text extracted from the region.</returns>
public string GetStringValueFromRegion(string file, iTextSharp.text.Rectangle rectangle)
{
    var reader = new PdfReader(file);
    try
    {
        var renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rectangle) };
        ITextExtractionStrategy textExtractionStrategy =
            new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

        return PdfTextExtractor.GetTextFromPage(reader, 1, textExtractionStrategy);
    }
    finally
    {
        // The reader is opened here, so it must also be released here.
        reader.Close();
    }
}
/// <summary>
/// Builds a text extraction strategy that only sees text inside the rectangle described
/// by the four arguments.
/// </summary>
/// <remarks>
/// The arguments are passed, in order, to the <c>System.util.RectangleJ</c> constructor.
/// </remarks>
/// <returns>A location-based strategy wrapped in a region filter.</returns>
public static ITextExtractionStrategy MakeRectangle(float pixelDistanceFromLeft, float pixelDistanceFromBottom, float pixelDistanceWidth, float pixelDistanceHeight)
{
    var region = new System.util.RectangleJ(
        pixelDistanceFromLeft,
        pixelDistanceFromBottom,
        pixelDistanceWidth,
        pixelDistanceHeight);

    RenderFilter[] regionFilters = { new RegionTextRenderFilter(region) };
    return new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);
}
/// <summary>
/// Returns the text found on page <paramref name="pageNum"/> inside the rectangle with
/// lower-left corner (<paramref name="llx"/>, <paramref name="lly"/>) and upper-right
/// corner (<paramref name="urx"/>, <paramref name="ury"/>).
/// </summary>
private static string GetColumnText(PdfReader reader, int pageNum, float llx, float lly, float urx, float ury)
{
    // Reminder: the coordinates are in points, and 1 in = 2.54 cm = 72 points.
    RenderFilter[] regionFilters =
    {
        new RegionTextRenderFilter(new iTextSharp.text.Rectangle(llx, lly, urx, ury))
    };
    var regionListener = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters);

    return PdfTextExtractor.GetTextFromPage(reader, pageNum, regionListener);
}
/// <summary>
/// Runs <c>TextWithFontExtractionStategy</c> over every page of a hard-coded PDF file.
/// The extracted text itself is discarded.
/// </summary>
static void Main(string[] args)
{
    string strFile = @"D:\yoo\张甜甜-Java开发.pdf";

    PdfReader reader = new PdfReader(strFile);
    try
    {
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            // A fresh strategy per page so nothing carries over between pages.
            ITextExtractionStrategy strategy = new TextWithFontExtractionStategy();
            PdfTextExtractor.GetTextFromPage(reader, i, strategy);
        }
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Reads two regions (A and B) from page 1 of the current reader and stores the
/// extracted text in <c>textA</c> and <c>textB</c>.
/// </summary>
private void ReadText()
{
    var regionA = new iTextSharp.text.Rectangle(coordinate1a, coordinate2a, coordinate3a, coordinate4a);
    var regionB = new iTextSharp.text.Rectangle(coordinate1b, coordinate2b, coordinate3b, coordinate4b);

    RenderFilter filterA = new RegionTextRenderFilter(regionA);
    RenderFilter filterB = new RegionTextRenderFilter(regionB);

    // Each region gets its own strategy instance, so the two results stay independent.
    ITextExtractionStrategy strategyA = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterA);
    ITextExtractionStrategy strategyB = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filterB);

    textA = PdfTextExtractor.GetTextFromPage(reader, 1, strategyA);
    textB = PdfTextExtractor.GetTextFromPage(reader, 1, strategyB);
}
/// <summary>
/// Reads two fixed regions from every page of the source PDF and writes one
/// tab-separated line per page to "extractedIDs.txt" in the job's temp folder.
/// </summary>
/// <remarks>
/// A running counter is reset whenever the text of the first region differs from the
/// previous page's. A page that fails to extract is recorded as "failed" and processing
/// continues with the next page.
/// </remarks>
public static void ExtractIDs()
{
    PdfReader reader = new PdfReader($@"{jobDir}\PDF Extraction\temp\{sourcePDF}");
    try
    {
        using (FileStream fs = new FileStream($@"{jobDir}\PDF Extraction\temp\extractedIDs.txt", FileMode.Create))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            string previousVal = "";
            int count = 0;

            sw.WriteLine("Index\tID\tPageCounter\tPageNumber\tFileName");

            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                try
                {
                    // First region: the value used for the ID column.
                    Rectangle area = new Rectangle(414, 660, 522, 689);
                    RenderFilter filter = new RegionTextRenderFilter(area);
                    ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                    string currentVal = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                    // Second region: only its first line is written out.
                    Rectangle area2 = new Rectangle(465, 565, 555, 635);
                    RenderFilter filter2 = new RegionTextRenderFilter(area2);
                    strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter2);
                    string pages = PdfTextExtractor.GetTextFromPage(reader, i, strategy);

                    // Restart the counter when the ID changes from the previous page.
                    if (previousVal != currentVal)
                    {
                        count = 0;
                    }
                    count++;
                    previousVal = currentVal;

                    sw.WriteLine($"{i}\t{currentVal}\t{pages.Split('\n')[0]}\t{count}\t{currentVal}-{count}");
                }
                catch (Exception)
                {
                    // Best effort: note the failing page and keep going.
                    sw.WriteLine($"{i}\tfailed");
                }
            }
        }
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Attaches one location-based strategy per region to a single
/// <c>MultiFilteredRenderListener</c>, processes page 1 of "test.pdf" once, and checks
/// that each strategy collected the expected text for its region.
/// </summary>
public virtual void Test()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
    try
    {
        String[] expectedText = new String[]
        {
            "PostScript Compatibility",
            "Because the PostScript language does not support the transparent imaging \n" +
            "model, PDF 1.4 consumer applications must have some means for converting the \n" +
            "appearance of a document that uses transparency to a purely opaque description \n" +
            "for printing on PostScript output devices. Similar techniques can also be used to \n" +
            "convert such documents to a form that can be correctly viewed by PDF 1.3 and \n" +
            "earlier consumers. ",
            "Otherwise, flatten the colors to some assumed device color space with pre-\n" +
            "determined calibration. In the generated PostScript output, paint the flattened \n" +
            "colors in a CIE-based color space having that calibration. "
        };

        Rectangle[] regions = new Rectangle[]
        {
            new Rectangle(90, 605, 220, 581),
            new Rectangle(80, 578, 450, 486),
            new Rectangle(103, 196, 460, 143)
        };

        MultiFilteredRenderListener listener = new MultiFilteredRenderListener();
        LocationTextExtractionStrategy[] extractionStrategies = new LocationTextExtractionStrategy[regions.Length];
        for (int i = 0; i < regions.Length; i++)
        {
            // One strategy per region, each behind its own region filter.
            extractionStrategies[i] = (LocationTextExtractionStrategy) listener.AttachRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(regions[i]));
        }

        // A single pass over the page feeds every attached strategy.
        new PdfReaderContentParser(pdfReader).ProcessContent(1, listener);

        for (int i = 0; i < regions.Length; i++)
        {
            String actualText = extractionStrategies[i].GetResultantText();
            Assert.AreEqual(expectedText[i], actualText);
        }
    }
    finally
    {
        // Release the reader even when an assertion fails.
        pdfReader.Close();
    }
}
/// <summary>
/// Extracts text from a PDF file (only selectable text content from the pages, not OCR from images).
/// </summary>
/// <param name="filepath">Input file path.</param>
/// <param name="zone">
/// Rectangle, in millimetres and measured from the top-left corner of the page, that limits
/// where text is extracted from. If it is null, the full page is processed.
/// </param>
/// <param name="pages">
/// Page numbers (1-based) to extract. If null, empty, or the first item is 0, all pages are
/// extracted. Numbers outside the document's page range are skipped.
/// </param>
/// <returns>A list of strings, one per extracted page, with line breaks normalised to <see cref="Environment.NewLine"/>.</returns>
public static List<string> GetPdfTextFromPages(string filepath, RectangleF? zone = null, List<int> pages = null)
{
    using (PdfReader reader = new PdfReader(filepath))
    {
        List<string> result = new List<string>();

        // An empty list is treated like null so the "first item" check cannot throw.
        if (pages == null || pages.Count == 0 || pages[0] == 0)
        {
            pages = Enumerable.Range(1, reader.NumberOfPages).ToList();
        }

        foreach (int pageNumber in pages)
        {
            if (pageNumber < 1 || pageNumber > reader.NumberOfPages)
            {
                continue;
            }

            string text;
            if (zone.HasValue)
            {
                // Zone-based extraction: convert millimetres to points first.
                float x = Utilities.MillimetersToPoints(zone.Value.X);
                float y = Utilities.MillimetersToPoints(zone.Value.Y);
                float w = Utilities.MillimetersToPoints(zone.Value.Width);
                float h = Utilities.MillimetersToPoints(zone.Value.Height);

                // Translate the top-left based zone into iText coordinates by
                // subtracting the Y values from the top of the page.
                var pagesize = reader.GetPageSizeWithRotation(pageNumber);
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(x, pagesize.Top - y, x + w, pagesize.Top - y - h);

                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy =
                    new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                text = PdfTextExtractor.GetTextFromPage(reader, pageNumber, textExtractionStrategy);
            }
            else
            {
                // Full-page extraction.
                text = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
            }

            result.Add(text.Replace("\n", Environment.NewLine));
        }

        return result;
    }
}
/// <summary>
/// Stores the downloaded PDF bytes in <c>bytes</c>, then reads a 6 x 8 grid of rectangles
/// from page 1 and appends the text of each one to <c>nakedData</c>.
/// </summary>
/// <remarks>
/// When <c>IsLaba</c> returns true for a cell, the cell is read again with a wider
/// rectangle and the next column is skipped.
/// </remarks>
/// <param name="www">Completed request whose download handler holds the PDF data.</param>
void TakeDataFromPdf(UnityWebRequest www)
{
    using (MemoryStream mm = new MemoryStream(www.downloadHandler.data))
    {
        bytes = mm.ToArray();

        PdfReader reader = new PdfReader(mm);
        try
        {
            for (int q = 0; q < 6; q++)
            {
                for (int j = 0; j < 8; j++)
                {
                    Rectangle rect = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * Bdeltax), 130 + (q * Bdeltay));
                    RenderFilter filter = new RegionTextRenderFilter(rect);
                    ITextExtractionStrategy strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                    string s = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                    if (IsLaba(s))
                    {
                        float microBdeltaX = 0;
                        if (j == 0)
                        {
                            microBdeltaX = Bdeltax;
                        }

                        // Use a doubled step for the right edge without touching the
                        // Bdeltax field, so a failed extraction cannot leave it doubled.
                        var doubledBdeltax = Bdeltax * 2;
                        rect = new Rectangle(45 + (j * Adeltax), 40 + (q * Adeltay), 138 + (j * doubledBdeltax) + microBdeltaX, 130 + (q * Bdeltay));
                        filter = new RegionTextRenderFilter(rect);
                        strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
                        s = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

                        // The widened cell also covers the next column, so skip it.
                        j++;
                    }

                    nakedData.Add(s);
                }
            }
        }
        finally
        {
            reader.Close();
        }
    }
}
/// <summary>
/// Writes to the console, for every page of <c>pdfReader</c>, the text found inside a
/// fixed rectangle.
/// </summary>
/// <returns>Always <c>null</c>.</returns>
public string ExtractByCoordinate()
{
    Rectangle rectangle = new Rectangle(320, 785 - 250, 368, 799 - 250);
    RenderFilter filter = new RegionTextRenderFilter(rectangle);

    for (int page = 1; page <= pdfReader.NumberOfPages; page++)
    {
        // A location strategy keeps every chunk it has received, so a new instance is
        // needed for each page; reusing one would repeat the text of earlier pages.
        ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
        Console.WriteLine(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
    }

    return null;
}
/// <summary>
/// Extracts the text inside a rectangle given in UI coordinates, after scaling it to the
/// size returned by <c>PageSize()</c>.
/// </summary>
/// <param name="UIWith">Width of the UI surface the points were picked on.</param>
/// <param name="UIHeight">Height of the UI surface the points were picked on.</param>
/// <param name="ll">First corner, in UI coordinates.</param>
/// <param name="ur">Second corner, in UI coordinates.</param>
/// <param name="page">Page number to read; defaults to 1.</param>
/// <returns>The text extracted from the scaled region.</returns>
public string ExtractData(float UIWith, float UIHeight, Point ll, Point ur, int page = 1)
{
    Console.WriteLine("Test");

    // Scale factors from UI units to page units.
    float scaleX = PageSize().Width / UIWith;
    float scaleY = PageSize().Height / UIHeight;

    // Y is flipped by subtracting it from the UI height before scaling.
    Rectangle region = new Rectangle(
        ll.X * scaleX,
        (UIHeight - ll.Y) * scaleY,
        ur.X * scaleX,
        (UIHeight - ur.Y) * scaleY);

    RenderFilter regionFilter = new RegionTextRenderFilter(region);
    ITextExtractionStrategy regionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilter);

    return PdfTextExtractor.GetTextFromPage(PdfReader, page, regionStrategy);
}
/// <summary>
/// Returns the text found inside a fixed rectangle on page 1 of the given PDF.
/// </summary>
/// <param name="fileName">Path of the PDF file to open.</param>
/// <returns>The extracted text, or "unreadable" if any exception occurs.</returns>
public static string ReadID(string fileName)
{
    PdfReader reader = null;
    try
    {
        reader = new PdfReader(fileName);

        Rectangle area = new Rectangle(414, 660, 522, 689);
        RenderFilter filter = new RegionTextRenderFilter(area);
        ITextExtractionStrategy strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

        return PdfTextExtractor.GetTextFromPage(reader, 1, strategy);
    }
    catch (Exception)
    {
        // Callers receive a sentinel value instead of an exception.
        return "unreadable";
    }
    finally
    {
        // reader stays null when the constructor itself failed.
        if (reader != null)
        {
            reader.Close();
        }
    }
}
/// <summary>
/// Returns the text found on the given page inside the rectangle built from
/// <paramref name="x"/>, <paramref name="y"/>, <paramref name="width"/> and <paramref name="height"/>.
/// </summary>
/// <remarks>
/// The text is returned exactly as extracted. It is not passed through the system
/// default encoding, because a string that round-trips through a non-Unicode code page
/// can lose characters that code page cannot represent.
/// </remarks>
/// <param name="filename">Path of the PDF file to open.</param>
/// <param name="page">Page number handed to the extractor.</param>
/// <returns>The text extracted from the region.</returns>
public string GetTextInArea(string filename, int page, int x, int y, int width, int height)
{
    using (PdfReader pdfReader = new PdfReader(filename))
    {
        RenderFilter filter = new RegionTextRenderFilter(new System.util.RectangleJ(x, y, width, height));
        var strategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);

        return PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
    }
}
/// <summary>
/// Returns the text found on page <paramref name="pageno"/> inside the rectangle described
/// by the four coordinates.
/// </summary>
/// <param name="filepath">Path of the PDF file to open.</param>
/// <param name="pageno">Page number handed to the extractor.</param>
/// <param name="cordinate1">First X coordinate.</param>
/// <param name="coordinate2">First Y coordinate.</param>
/// <param name="coordinate3">Second X coordinate.</param>
/// <param name="coordinate4">Second Y coordinate.</param>
/// <param name="filter">
/// When true, the Y coordinates are used as given; when false, each Y coordinate is
/// replaced by 1000 minus its value.
/// </param>
/// <returns>The text extracted from the region.</returns>
public string getParagraphByCoOrdinate(string filepath, int pageno, int cordinate1, int coordinate2, int coordinate3, int coordinate4, bool filter)
{
    PdfReader reader = new PdfReader(filepath);
    try
    {
        // Only the Y values differ between the two modes.
        int firstY = filter ? coordinate2 : 1000 - coordinate2;
        int secondY = filter ? coordinate4 : 1000 - coordinate4;

        iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(cordinate1, firstY, coordinate3, secondY);
        RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
        ITextExtractionStrategy textExtractionStrategy =
            new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);

        return PdfTextExtractor.GetTextFromPage(reader, pageno, textExtractionStrategy);
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Reads one cell from page 1 of the PDF held in <c>bytes</c> and reports whether its text
/// contains "семинар" (seminar) or "лекции" (lectures).
/// </summary>
/// <param name="x">
/// Value passed to <c>EnterManager.instance.checkNumberOfLesson</c>, which returns the
/// column index and fills in the row index used to locate the cell.
/// </param>
/// <returns>True when the cell text contains either word; otherwise false.</returns>
public bool TakeSomeCellOfData(int x)
{
    int y = 0;
    x = EnterManager.instance.checkNumberOfLesson(x, ref y);

    using (MemoryStream mm = new MemoryStream(bytes))
    {
        PdfReader reader = new PdfReader(mm);
        try
        {
            Rectangle rect = new Rectangle(45 + (x * Adeltax), 40 + (y * Adeltay), 138 + (x * Bdeltax), 130 + (y * Bdeltay));
            RenderFilter filter = new RegionTextRenderFilter(rect);
            ITextExtractionStrategy strategy = new FilteredTextRenderListener(new SimpleTextExtractionStrategy(), filter);
            string s = PdfTextExtractor.GetTextFromPage(reader, 1, strategy);

            return s.Contains("семинар") || s.Contains("лекции");
        }
        finally
        {
            reader.Close();
        }
    }
}
/// <summary>
/// Dumps the text of every page of a PDF to <paramref name="outFileName"/> and writes one
/// semicolon-separated summary line per page to a second file named
/// "filtered " + <paramref name="outFileName"/>.
/// </summary>
/// <remarks>
/// Each summary line holds, in order: the "Matricule ... SS ..." identifier found by regex
/// (empty when not found), an empty field, the text of a fixed identifier region, the
/// "Net imposable" value found by regex (empty when not found), and the text of a fixed
/// net-amount region.
/// Known cases where the regex and region values differ: a page that recalls a previous
/// period has no "Net imposable" for the period, and a payslip spanning several pages has
/// an empty net-amount region.
/// </remarks>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <param name="outFileName">Path of the full-text output file.</param>
public static void ExtractText(string fileName, string outFileName)
{
    // NOTE(review): the prefix is put in front of the whole string, so an outFileName that
    // contains a directory yields a path starting with "filtered " — confirm callers only
    // pass bare file names.
    using (StreamWriter outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8))
    using (StreamWriter outFile2 = new StreamWriter("filtered " + outFileName, false, System.Text.Encoding.UTF8))
    {
        PdfReader pdfReader = new PdfReader(fileName);
        try
        {
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                string id1 = "";
                string id2 = "";
                string netRegex = "";

                // Full page text goes to the first file unchanged.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
                outFile.Write(currentPageText);

                // Identifier found by pattern in the page text.
                var match = Regex.Match(currentPageText, "Matricule : *([0-9]*) SS : ([0-9]*)");
                if (match.Success)
                {
                    id1 = match.Groups[1].Value;
                    id2 = match.Groups[2].Value;
                    outFile2.Write("Matricule : " + id1 + " SS : " + id2);
                }
                outFile2.Write(";");

                // Identifier read from a fixed region of the page.
                // NOTE(review): the two Y values use different bases (842 and 848) — confirm intended.
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(400, 842 - 128, 480, 848 - 110);
                RenderFilter[] renderFilter = { new RegionTextRenderFilter(rect) };
                ITextExtractionStrategy textExtractionStrategy =
                    new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                string idLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                outFile2.Write(";");
                outFile2.Write(idLocation);
                outFile2.Write(";");

                // Net amount found by pattern in the page text.
                match = Regex.Match(currentPageText, "Net imposable(.*)Net imposable");
                if (match.Success)
                {
                    netRegex = match.Groups[1].Value;
                    outFile2.Write(netRegex);
                }

                // Net amount read from a fixed region of the page.
                rect = new iTextSharp.text.Rectangle(150, 240, 210, 250);
                renderFilter = new RenderFilter[] { new RegionTextRenderFilter(rect) };
                textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                string netLocation = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);
                outFile2.Write(";");
                outFile2.Write(netLocation);

                outFile2.WriteLine();
            }
        }
        finally
        {
            pdfReader.Close();
        }
    }
}
/// <summary>
/// Walks every link annotation of the PDF at <paramref name="path"/>, reads the text lying
/// under the annotation rectangle, requests the link's URI over HTTP, and adds one row per
/// processed link to <c>skuGrid</c>.
/// </summary>
/// <remarks>
/// Links are handled in two ways, depending on the URI: URIs of exactly 70 characters that
/// start with "https" are treated as SKU links (the part after "=" is compared), and other
/// URIs that start with "www" or "https" are compared as whole strings against the response URI.
/// Any exception is shown in a message box and logged. The finally block always clears
/// <c>taskIsRunning</c>, updates the grid, closes the reader, and writes the log.
/// </remarks>
/// <param name="token">Token checked after each processed link to stop the scrape.</param>
/// <param name="path">Path of the PDF file to scan.</param>
private void Scrape(CancellationToken token, string path)
{
    PdfReader reader = new PdfReader(path);
    // NOTE(review): sUri is declared outside both loops and is only reassigned for
    // non-GOTO actions, so a GOTO link is processed with the previous link's URI
    // (or "" for the first link) — confirm this is intended.
    string sUri = "";
    try
    {
        // Pagination
        for (var i = 1; i <= reader.NumberOfPages; i++)
        {
            //get current page
            var pageDict = reader.GetPageN(i);
            //get all annotations from current page
            var annotArray = (PdfArray)PdfReader.GetPdfObject(pageDict.Get(PdfName.ANNOTS));
            //skip pages that have no annotations
            if (annotArray == null)
            {
                continue;
            }
            if (annotArray.Length <= 0)
            {
                continue;
            }
            // check every annotation on the page
            foreach (var annot in annotArray.ArrayList)
            {
                //resolve the array entry to the annotation dictionary
                var annotDict = (PdfDictionary)PdfReader.GetPdfObject(annot);
                //ensure the object isn't empty
                if (annotDict == null)
                {
                    continue;
                }
                //get the annotation subtype and ensure it is a link
                var subtype = annotDict.Get(PdfName.SUBTYPE).ToString();
                Log("Subtype: " + subtype);
                if (subtype != "/Link")
                {
                    continue;
                }
                //get the annotation's ACTION
                var linkDict = (PdfDictionary)annotDict.GetDirectObject(PdfName.A);
                if (linkDict == null)
                {
                    continue;
                }
                if (!linkDict.Get(PdfName.S).Equals(PdfName.GOTO))
                {
                    //get the link from the annotation
                    sUri = linkDict.Get(PdfName.URI).ToString();
                    Log("URI: " + sUri);
                    if (String.IsNullOrEmpty(sUri))
                    {
                        continue;
                    }
                }
                //holds the anchor text read from the page
                string linkTextBuilder;
                //create a rectangle from the annotation's RECT entry, read the text under the rectangle (the anchor text for the link) and write it to the string
                var LinkLocation = annotDict.GetAsArray(PdfName.RECT);
                // linestringlist is never used after this line.
                List <string> linestringlist = new List <string>();
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(((PdfNumber)LinkLocation[0]).FloatValue, ((PdfNumber)LinkLocation[1]).FloatValue, ((PdfNumber)LinkLocation[2]).FloatValue, ((PdfNumber)LinkLocation[3]).FloatValue);
                RenderFilter[] renderFilter = new RenderFilter[1];
                renderFilter[0] = new RegionTextRenderFilter(rect);
                ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(), renderFilter);
                linkTextBuilder = PdfTextExtractor.GetTextFromPage(reader, i, textExtractionStrategy).Trim();
                Log(linkTextBuilder);
                // NOTE(review): Substring throws when sUri is shorter than the requested
                // length (for example when it is still "") — confirm short URIs cannot occur.
                String sUriHTTPSPrefix = sUri.Substring(0, 5);
                String sUriWWWPrefix = sUri.Substring(0, 3);
                if (sUriHTTPSPrefix.Equals("https"))
                {
                    Log("Prefix: " + sUriHTTPSPrefix);
                }
                else if (sUriWWWPrefix.Equals("www"))
                {
                    Log("Prefix: " + sUriWWWPrefix);
                }
                // Case 1: a 70-character https URI, handled as a SKU link.
                if (sUri.Length.Equals(70) && sUriHTTPSPrefix.Equals("https"))
                {
                    //instantiate the web request and response objects
                    WebRequest httpReq = WebRequest.Create(sUri);
                    WebResponse response = httpReq.GetResponse();
                    //check website response from request and save status to string
                    string webStatus = ((HttpWebResponse)response).StatusDescription.ToString();
                    Log("Webstatus: " + webStatus);
                    //check website response url from request and save url to string
                    string responseURL = ((HttpWebResponse)response).ResponseUri.ToString();
                    Log("Response URI: " + responseURL);
                    //split the response url string to just sku number after the "="
                    string webSite = responseURL.Split('=')[1];
                    Log("Response SKU: " + webSite);
                    //split the link harvested from the annotations in the pdf after the "="
                    string sku = sUri.Split('=')[1];
                    Log("PDF SKU: " + sku);
                    //keep only the first 7 characters of the sku
                    string finalSku = sku.Substring(0, 7);
                    Log("PDF SKU Final: " + finalSku);
                    //delete asterisks from sku
                    var deleteChars = new string[] { "*" };
                    foreach (var c in deleteChars)
                    {
                        // NOTE(review): this assigns the cleaned finalSku (taken from the URI) to
                        // linkTextBuilder, discarding the anchor text read from the page; the
                        // other branch cleans linkTextBuilder itself — confirm which is intended.
                        linkTextBuilder = finalSku.Replace(c, string.Empty);
                    }
                    //keep only the first 7 characters of the link text
                    string linkText = linkTextBuilder.Substring(0, 7);
                    Log("Link SKU Final: " + linkText);
                    //default status of string match is "NO MATCH"
                    string match = "\tNO MATCH";
                    //make a blank IPEndPoint
                    IPEndPoint remoteEP = null;
                    //create a new httpwebrequest
                    HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(sUri);
                    //capture the remote end point handed to the bind delegate
                    req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount) { remoteEP = remoteEndPoint; return(null); };
                    //get the response from the request
                    // NOTE(review): GetResponse is called twice here and neither result is closed.
                    req.GetResponse();
                    Log("HTTP response: " + req.GetResponse());
                    //set remoteEndPoint to string for display
                    string hostIP = remoteEP.Address.ToString();
                    Log("Host IP: " + hostIP);
                    //stop the request to make way for a new request on next iteration
                    req.Abort();
                    //if the link text and the website response sku both equal the sku from the PDF link then change match
                    if (linkText.Equals(sku) && webSite.Equals(sku))
                    {
                        match = "MATCH";
                    }
                    //add data to datagridview
                    this.skuGrid.Rows.Add(i, sku, linkTextBuilder, webSite, match, webStatus, hostIP);
                    //close the first response
                    response.Close();
                    if (token.IsCancellationRequested)
                    {
                        // ThrowIfCancellationRequested throws here, so the statements after it
                        // in this block do not run; the exception lands in the catch below.
                        token.ThrowIfCancellationRequested();
                        Log("Cancellation token: " + token);
                        MessageBox.Show("Scrape stopped");
                        throbber.Hide();
                        skuGrid.Show();
                    }
                }
                // Case 2: any other length, starting with "www" or "https"
                // (&& binds tighter than ||, so this reads as (A && B) || (A && C)).
                else if (!sUri.Length.Equals(70) && sUriWWWPrefix.Equals("www") || !sUri.Length.Equals(70) && sUriHTTPSPrefix.Equals("https"))
                {
                    //instantiate the web request and response objects
                    WebRequest httpReq = WebRequest.Create(sUri);
                    WebResponse response = httpReq.GetResponse();
                    //check website response from request and save status to string
                    string webStatus = ((HttpWebResponse)response).StatusDescription.ToString();
                    Log("Web Status: " + webStatus);
                    //check website response url from request and save url to string
                    string responseURL = ((HttpWebResponse)response).ResponseUri.ToString();
                    Log("Response URI: " + responseURL);
                    //the whole response url is used for the comparison in this branch
                    string webSite = responseURL;
                    Log("Response SKU: " + webSite);
                    //default status of string match is "NO MATCH"
                    string match = "\tNO MATCH";
                    //make a blank IPEndPoint
                    IPEndPoint remoteEP = null;
                    //create a new httpwebrequest
                    HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(sUri);
                    //capture the remote end point handed to the bind delegate
                    req.ServicePoint.BindIPEndPointDelegate = delegate(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount) { remoteEP = remoteEndPoint; return(null); };
                    //get the response from the request
                    // NOTE(review): GetResponse is called twice here and neither result is closed.
                    req.GetResponse();
                    Log("HTTP Response: " + req.GetResponse());
                    //set remoteEndPoint to string for display
                    string hostIP = remoteEP.Address.ToString();
                    Log("Host IP: " + hostIP);
                    //stop the request to make way for a new request on next iteration
                    req.Abort();
                    //delete asterisks from the anchor text
                    var deleteChars = new string[] { "*" };
                    foreach (var c in deleteChars)
                    {
                        linkTextBuilder = linkTextBuilder.Replace(c, string.Empty);
                    }
                    //if the pdf anchor text equals the website response url then change match
                    if (linkTextBuilder.Equals(webSite))
                    {
                        match = "MATCH";
                    }
                    //add data to datagridview (no sku column value in this branch)
                    this.skuGrid.Rows.Add(i, null, linkTextBuilder, webSite, match, webStatus, hostIP);
                    //close the first response
                    response.Close();
                    if (token.IsCancellationRequested)
                    {
                        // ThrowIfCancellationRequested throws here, so the statements after it
                        // in this block do not run; the exception lands in the catch below.
                        token.ThrowIfCancellationRequested();
                        Log("Cancellation token: " + token);
                        MessageBox.Show("Scrape stopped");
                        throbber.Hide();
                        skuGrid.Show();
                    }
                }
            }
        }
        MessageBox.Show("Scrape complete");
        throbber.Hide();
        skuGrid.Show();
    }
    catch (Exception ex)
    {
        // Every failure, including cancellation, is reported to the user and logged.
        MessageBox.Show(ex.Message);
        Log("EXCEPTION ERROR: " + ex + " " + ex.HelpLink);
        throbber.Hide();
        skuGrid.Show();
    }
    finally
    {
        taskIsRunning = false;
        //update grid
        skuGrid.Update();
        //close PDF reader
        reader.Close();
        WriteLog(log);
    }
}