/// <summary>
/// Flags this parser as stopped and throws away the tuple that is currently pending, if any.
/// </summary>
/// <remarks>
/// The whole operation runs while holding the monitor on this instance. The monitor is pulsed
/// only when a pending tuple was actually discarded, so that a thread waiting on this monitor
/// can re-evaluate its wait condition.
/// </remarks>
internal void Stop()
{
    UninterruptableMonitor.Enter(this);
    try
    {
        stopped = true;

        if (tuple == null)
        {
            // No pending tuple to discard; the finally block still releases the monitor.
            return;
        }

        tuple = null;
        UninterruptableMonitor.Pulse(this); // Java original: notify()
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }
}
/// <summary>
/// Parser loop: feeds the outer instance's input stream to a TagSoup parser, with this object
/// registered as both content handler and error handler, until <c>stopped</c> is set or the
/// outer instance is not configured to run forever.
/// </summary>
/// <remarks>
/// <para>
/// After each parse pass (or when the stream reference was <c>null</c>), the method takes the
/// monitor on this instance. If <c>stopped</c> is set or <c>m_forever</c> is false, it stores a
/// new <c>NoMoreDataException</c> in <c>nmde</c>, pulses the monitor and returns. Otherwise, if
/// the outer stream is still the one that was just parsed, it is re-opened via
/// <c>OpenInputStream()</c>.
/// </para>
/// <para>
/// <c>SAXException</c>s and I/O exceptions that escape the loop are wrapped with
/// <c>RuntimeException.Create</c>. In every case <c>threadDone</c> is set and the monitor is
/// pulsed before the method exits.
/// </para>
/// </remarks>
public void Run()
{
    try
    {
        Sax.IXMLReader xmlReader = new TagSoup.Parser(); // Java original: XMLReaderFactory.createXMLReader()
        xmlReader.ContentHandler = this;
        xmlReader.ErrorHandler = this;

        while (!stopped)
        {
            // Snapshot the stream reference; a null value means the stream was closed on us.
            Stream currentStream = outerInstance.@is;

            if (currentStream != null)
            {
                try
                {
                    // To work around a bug in XERCES (XERCESJ-1257), the XML is assumed to always
                    // be UTF-8, so a decoding reader is handed to the parser instead of the raw stream.
                    var decodingReader = IOUtils.GetDecodingReader(currentStream, Encoding.UTF8);
                    xmlReader.Parse(new InputSource(decodingReader));
                }
                catch (Exception parseFailure) when (parseFailure.IsIOException())
                {
                    UninterruptableMonitor.Enter(outerInstance);
                    try
                    {
                        if (currentStream == outerInstance.@is)
                        {
                            // The stream was not swapped out underneath us, so the exception is real.
                            throw; // LUCENENET: CA2200: Rethrow to preserve stack details (https://docs.microsoft.com/en-us/visualstudio/code-quality/ca2200-rethrow-to-preserve-stack-details)
                        }

                        // Otherwise the stream was closed on us: just fall through.
                    }
                    finally
                    {
                        UninterruptableMonitor.Exit(outerInstance);
                    }
                }
            }

            UninterruptableMonitor.Enter(this);
            try
            {
                if (stopped || !outerInstance.m_forever)
                {
                    nmde = new NoMoreDataException();
                    UninterruptableMonitor.Pulse(this); // Java original: notify()
                    return;
                }

                if (currentStream == outerInstance.@is)
                {
                    // The file has not been re-opened by anyone else yet, so re-open it now.
                    outerInstance.@is = outerInstance.OpenInputStream();
                }
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
    }
    catch (SAXException saxFailure)
    {
        throw RuntimeException.Create(saxFailure);
    }
    catch (Exception ioFailure) when (ioFailure.IsIOException())
    {
        throw RuntimeException.Create(ioFailure);
    }
    finally
    {
        // Always record that this method has finished and pulse the monitor on this instance.
        UninterruptableMonitor.Enter(this);
        try
        {
            threadDone = true;
            UninterruptableMonitor.Pulse(this); // Java original: notify()
        }
        finally
        {
            UninterruptableMonitor.Exit(this);
        }
    }
}
/// <summary>
/// Handles the end of an XML element. For body, date, title and id elements the accumulated
/// <c>contents</c> are stored in the matching field; at the end of a page element the collected
/// fields are packed into a tuple and handed over through the <c>tuple</c> field.
/// </summary>
/// <param name="namespace">Namespace of the element; not used by this method.</param>
/// <param name="simple">Simple name of the element; not used by this method.</param>
/// <param name="qualified">Qualified element name; passed to <c>GetElementType</c> to select the handling.</param>
/// <exception cref="Util.ThreadInterruptedException">
/// Thrown when waiting for the previous tuple to be taken is interrupted.
/// </exception>
public override void EndElement(string @namespace, string simple, string qualified)
{
    int elemType = GetElementType(qualified);
    switch (elemType)
    {
        case PAGE:
            // The body must NOT be null (redirect bodies are nulled in the BODY case below), and
            // either image docs are kept or the title does not start with "Image:".
            if (body != null && (outerInstance.keepImages || !title.StartsWith("Image:", StringComparison.Ordinal)))
            {
                // Tabs (and, for the body, newlines) are replaced with single spaces.
                string[] tmpTuple = new string[LENGTH];
                tmpTuple[TITLE] = title.Replace('\t', ' ');
                tmpTuple[DATE] = time.Replace('\t', ' ');
                // Two char replacements give the same result as Regex.Replace(body, "[\t\n]", " ")
                // without running the regex engine for every page.
                tmpTuple[BODY] = body.Replace('\t', ' ').Replace('\n', ' ');
                tmpTuple[ID] = id;

                UninterruptableMonitor.Enter(this);
                try
                {
                    // Block until the previously published tuple has been cleared or we were stopped.
                    while (tuple != null && !stopped)
                    {
                        try
                        {
                            UninterruptableMonitor.Wait(this); // Java original: wait()
                        }
                        catch (System.Threading.ThreadInterruptedException ie)
                        {
                            throw new Util.ThreadInterruptedException(ie);
                        }
                    }

                    tuple = tmpTuple;
                    UninterruptableMonitor.Pulse(this); // Java original: notify()
                }
                finally
                {
                    UninterruptableMonitor.Exit(this);
                }
            }
            break;

        case BODY:
            body = contents.ToString();
            // Redirect pages are dropped: a case-insensitive ordinal prefix check replaces the
            // former substring + lower-casing workaround and needs no temporary strings.
            if (body.StartsWith("#redirect", StringComparison.OrdinalIgnoreCase))
            {
                body = null;
            }
            break;

        case DATE:
            time = Time(contents.ToString());
            break;

        case TITLE:
            title = contents.ToString();
            break;

        case ID:
            // The doc id is the first one in the page. All other ids after that one can be
            // ignored according to the schema.
            if (id == null)
            {
                id = contents.ToString();
            }
            break;

        default:
            // This element should be discarded.
            break;
    }
}