///////////////////////////////////////////////////////////////////////////////////////////////

/// <summary>
/// Shrinks <paramref name="bufferLength"/> so that it no longer covers a trailing
/// end-of-line marker in <paramref name="buffer"/>.  The buffer contents are never
/// modified; only the logical length is adjusted.
/// </summary>
/// <param name="buffer">Bytes to inspect; nothing happens when this is null.</param>
/// <param name="endOfLine">End-of-line characters; nothing happens when this is null.</param>
/// <param name="useAnyEndOfLineChar">
/// When true, every trailing byte that is contained in <paramref name="endOfLine"/> is
/// dropped.  When false, the buffer must end with the exact
/// <paramref name="endOfLine"/> sequence for anything to be dropped.
/// </param>
/// <param name="bufferLength">
/// On input, the number of meaningful bytes in <paramref name="buffer"/>; on output,
/// that number minus the removed end-of-line bytes.  Values that are not positive are
/// left untouched.
/// </param>
private static void RemoveEndOfLine(
    byte[] buffer,            /* in */
    CharList endOfLine,       /* in */
    bool useAnyEndOfLineChar, /* in */
    ref int bufferLength      /* in, out */
    )
{
    //
    // NOTE: Nothing to trim without a buffer, a terminator list and at least
    //       one byte of data.
    //
    if ((buffer == null) || (endOfLine == null) || (bufferLength <= 0))
        return;

    if (useAnyEndOfLineChar)
    {
        //
        // NOTE: Walk backwards over every trailing byte that is one of the
        //       end-of-line characters.
        //
        int remaining = bufferLength;

        while ((remaining > 0) &&
            endOfLine.Contains(ConversionOps.ToChar(buffer[remaining - 1])))
        {
            remaining--;
        }

        bufferLength = remaining;
        return;
    }

    //
    // NOTE: Exact sequence mode: the tail of the buffer must equal the whole
    //       end-of-line sequence, in order.
    //
    int terminatorLength = endOfLine.Count;

    if (bufferLength < terminatorLength)
        return;

    int tailStart = bufferLength - terminatorLength;

    for (int offset = 0; offset < terminatorLength; offset++)
    {
        if (buffer[tailStart + offset] != ConversionOps.ToByte(endOfLine[offset]))
            return; /* NOTE: Tail does not match, leave the length alone. */
    }

    bufferLength = tailStart;
}
/// <summary>
/// Replaces every character of <paramref name="text"/> whose lower-case form is not
/// contained in <c>CharList</c> with a single space.
/// </summary>
/// <param name="text">Builder to scrub; it is modified in place.</param>
/// <returns>The same <paramref name="text"/> instance that was passed in.</returns>
private StringBuilder RemoveTrashXMLFast(StringBuilder text)
{
    // One snapshot and one linear pass.  The previous implementation called
    // StringBuilder.Replace over the whole builder once per offending
    // occurrence, which was quadratic on noisy input.
    char[] chars = text.ToString().ToCharArray();
    bool changed = false;

    for (int i = 0; i < chars.Length; i++)
    {
        char current = chars[i];

        // A space that is not allowed would be replaced by a space anyway,
        // so it never needs a lookup.
        if (current != ' ' && !CharList.Contains(char.ToLower(current)))
        {
            chars[i] = ' ';
            changed = true;
        }
    }

    if (changed)
    {
        // Rewrite the caller's builder rather than returning a new one, so
        // callers that ignore the return value still see the result.
        text.Length = 0;
        text.Append(chars);
    }

    return text;
}
/// <summary>
/// Normalises an n-gram: lower-cases it, masks an unusable first or last character
/// with <c>*</c>, and rejects n-grams that carry no usable information.
/// A character is "usable" when it is contained in <c>CharList</c> and is a letter.
/// </summary>
/// <param name="ngram">The raw n-gram.</param>
/// <returns>
/// The normalised n-gram, or null when: the input is null or empty; a one-character
/// n-gram is not usable; both characters of a two-character n-gram are unusable; or
/// any interior character of a longer n-gram is unusable.
/// </returns>
private string ModifyNGram(string ngram)
{
    // Previously an empty string reached ngram[0] and threw; treat it (and null)
    // like every other unusable n-gram.
    if (string.IsNullOrEmpty(ngram))
    {
        return null;
    }

    ngram = ngram.ToLower();

    char head = ngram[0];
    char tail = ngram[ngram.Length - 1];
    bool headUsable = CharList.Contains(head) && char.IsLetter(head);
    bool tailUsable = CharList.Contains(tail) && char.IsLetter(tail);

    if (ngram.Length == 1)
    {
        return headUsable ? ngram : null;
    }

    // Only two-character n-grams are rejected for having both edges masked;
    // longer ones may legitimately come out as "*center*".
    if (ngram.Length == 2 && !headUsable && !tailUsable)
    {
        return null;
    }

    // Interior characters are never masked: a single bad one rejects the n-gram.
    string center = ngram.Substring(1, ngram.Length - 2);
    foreach (char c in center)
    {
        if (!CharList.Contains(c) || !char.IsLetter(c))
        {
            return null;
        }
    }

    string first = headUsable ? head.ToString() : "*";
    string last = tailUsable ? tail.ToString() : "*";
    return first + center + last;
}
/// <summary>
/// Cleans plain text in four steps: removes <c>[...]</c> spans, removes
/// <c>&lt;...&gt;</c> spans, deletes every <c>==</c>, and finally replaces each
/// character whose lower-case form is not in <c>CharList</c> with a space.
/// </summary>
/// <param name="text">Builder to clean; it is modified in place.</param>
/// <returns>The same <paramref name="text"/> instance that was passed in.</returns>
private StringBuilder RemoveTrashPlainText(StringBuilder text)
{
    // Each step works on a plain string so the text is scanned once per step
    // instead of being re-materialised with ToString() on every loop iteration.
    string cleaned = text.ToString();
    cleaned = StripBracketedSpans(cleaned, '[', ']');
    cleaned = StripBracketedSpans(cleaned, '<', '>');
    cleaned = cleaned.Replace("==", "");

    char[] chars = cleaned.ToCharArray();
    for (int i = 0; i < chars.Length; i++)
    {
        if (!CharList.Contains(char.ToLower(chars[i])))
        {
            chars[i] = ' ';
        }
    }

    // Rewrite the caller's builder rather than returning a new one, so callers
    // that ignore the return value still see the result.
    text.Length = 0;
    text.Append(chars);
    return text;
}

/// <summary>
/// Removes every span that starts at an <paramref name="open"/> character and ends at
/// the first following <paramref name="close"/> character (both inclusive), scanning
/// left to right.  Scanning stops at the first opener that has no closer after it;
/// that opener and everything after it are kept unchanged.
/// </summary>
private static string StripBracketedSpans(string source, char open, char close)
{
    int openAt = source.IndexOf(open);
    if (openAt == -1)
    {
        return source;
    }

    StringBuilder kept = new StringBuilder(source.Length);
    int cursor = 0;

    while (openAt != -1)
    {
        int closeAt = source.IndexOf(close, openAt);
        if (closeAt == -1)
        {
            break; // unterminated span: keep the remainder as-is
        }

        kept.Append(source, cursor, openAt - cursor);
        cursor = closeAt + 1;
        openAt = source.IndexOf(open, cursor);
    }

    kept.Append(source, cursor, source.Length - cursor);
    return kept.ToString();
}
/// <summary>
/// Strips wiki-style markup from <paramref name="text"/>: normalises brace runs,
/// removes a fixed list of delimited spans (flat and nested), unwraps
/// <c>[[...]]</c> links, removes <c>[...]</c> spans, and finally replaces each
/// character whose lower-case form is not in <c>CharList</c> with a space.
/// </summary>
/// <param name="text">Builder to clean; it is modified in place.</param>
/// <returns>The same <paramref name="text"/> instance that was passed in.</returns>
/// <remarks>
/// All delimiter searches are ordinal.  A culture-sensitive search can report a match
/// whose length differs from the delimiter literal, which would make the computed
/// removal length wrong.
/// </remarks>
private StringBuilder RemoveTrashXML(StringBuilder text)
{
    // Removing ':{{', '{{{', '}}}', '}}}}'
    // NOTE(review): after every "}}}" has been removed no run of four closing braces
    // should remain, so the last Replace looks unreachable - confirm the intended order.
    text = text.Replace(":{{", "{{").Replace("{{{", "{{").Replace("}}}", "").Replace("}}}}", "}}");

    // Removing wrappers without depth: from the first opener to the first closer found
    // at or after it.
    // NOTE(review): the closer search starts AT the opener, so for the "==" and "="
    // entries (identical opener and closer) only the delimiter itself is removed, not
    // the text up to a second delimiter - confirm that is intended.
    List<Tuple<string, string>> flatWrappers = new List<Tuple<string, string>>()
    {
        new Tuple<string, string>("==", "=="),
        new Tuple<string, string>("=", "="),
        new Tuple<string, string>("<ref>", "</ref>"),
        new Tuple<string, string>("[[File:", ".]]"),
        new Tuple<string, string>("[[Image:", "]]"),
        new Tuple<string, string>("[[wikt:", "]]")
    };

    foreach (Tuple<string, string> wrapper in flatWrappers)
    {
        while (true)
        {
            // One snapshot per removal instead of one per IndexOf call.
            string current = text.ToString();

            int start = current.IndexOf(wrapper.Item1, StringComparison.Ordinal);
            if (start == -1)
            {
                break;
            }

            int end = current.IndexOf(wrapper.Item2, start, StringComparison.Ordinal);
            if (end == -1)
            {
                break; // unterminated: give up on this wrapper kind
            }

            text.Remove(start, end - start + wrapper.Item2.Length);
        }
    }

    // Removing wrappers that may nest: for every further opener located before the
    // current closer candidate, the candidate is advanced to the next closer.
    List<Tuple<string, string>> deepWrappers = new List<Tuple<string, string>>()
    {
        new Tuple<string, string>("[[File:", "]]"),
        new Tuple<string, string>("{{", "}}"),
        new Tuple<string, string>("{|", "|}"),
        new Tuple<string, string>("(", ")")
    };

    foreach (Tuple<string, string> wrapper in deepWrappers)
    {
        while (true)
        {
            string current = text.ToString();

            int start = current.IndexOf(wrapper.Item1, StringComparison.Ordinal);
            if (start == -1)
            {
                break;
            }

            int end = current.IndexOf(wrapper.Item2, start, StringComparison.Ordinal);
            int nestedOpen = start;

            while (true)
            {
                nestedOpen = current.IndexOf(wrapper.Item1, nestedOpen + 1, StringComparison.Ordinal);

                // When end is -1 this comparison is false, so the loop also stops
                // as soon as the closers run out.
                if (end > nestedOpen && nestedOpen != -1)
                {
                    end = current.IndexOf(wrapper.Item2, end + 1, StringComparison.Ordinal);
                }
                else
                {
                    break;
                }
            }

            if (end == -1)
            {
                break; // unbalanced: give up on this wrapper kind
            }

            text.Remove(start, end - start + wrapper.Item2.Length);
        }
    }

    // Removing '[[ ]]': keep the part before the first '|' when one lies inside the
    // link, otherwise keep the whole inner text.
    while (true)
    {
        string current = text.ToString();

        int start = current.IndexOf("[[", StringComparison.Ordinal);
        if (start == -1)
        {
            break;
        }

        int end = current.IndexOf("]]", start, StringComparison.Ordinal);
        if (end == -1)
        {
            break;
        }

        int divider = current.IndexOf('|', start);

        // Remove the right-hand piece first so 'start' stays valid for the second Remove.
        if (divider > start && divider < end)
        {
            text.Remove(divider, end - divider + 2);
        }
        else
        {
            text.Remove(end, 2);
        }

        text.Remove(start, 2);
    }

    // Removing '[ ]'
    while (true)
    {
        string current = text.ToString();

        int start = current.IndexOf('[');
        if (start == -1)
        {
            break;
        }

        int end = current.IndexOf(']', start);
        if (end == -1)
        {
            break;
        }

        text.Remove(start, end - start + 1);
    }

    // Removing other: a single linear pass instead of one whole-builder Replace per
    // offending occurrence.
    char[] chars = text.ToString().ToCharArray();
    bool changed = false;

    for (int i = 0; i < chars.Length; i++)
    {
        char c = chars[i];

        if (c != ' ' && !CharList.Contains(char.ToLower(c)))
        {
            chars[i] = ' ';
            changed = true;
        }
    }

    if (changed)
    {
        // Rewrite the caller's builder in place so the same instance is returned.
        text.Length = 0;
        text.Append(chars);
    }

    return text;
}
///////////////////////////////////////////////////////////////////////////////////////////////

/// <summary>
/// Reads bytes from the underlying stream into <paramref name="list"/> until an
/// end-of-line is seen, the end of the stream is reached, or
/// <paramref name="count"/> bytes have been read.
/// </summary>
/// <param name="count">
/// Maximum number of bytes to read, or <c>Count.Invalid</c> for no limit.  The limit
/// is checked after each byte, so one read is always attempted.
/// </param>
/// <param name="endOfLine">
/// End-of-line characters; when null or empty no end-of-line detection is done.
/// </param>
/// <param name="useAnyEndOfLineChar">
/// When true, any single character from <paramref name="endOfLine"/> terminates the
/// read; when false, the characters must appear consecutively as a sequence.
/// </param>
/// <param name="list">
/// Receives the bytes read (appended).  Created here when null.  A detected
/// end-of-line is removed from the end of the list before returning.
/// </param>
/// <param name="error">Set to "invalid stream" when there is no stream.</param>
/// <returns><c>ReturnCode.Ok</c> when a stream was available; otherwise <c>ReturnCode.Error</c>.</returns>
public ReturnCode Read( /* throw */
    int count,
    CharList endOfLine,
    bool useAnyEndOfLineChar,
    ref ByteList list,
    ref Result error
    )
{
    CheckDisposed();

    ReturnCode code = ReturnCode.Error;

    if (stream != null)
    {
        //
        // NOTE: Allocate enough for the whole file.
        //
        if (list == null)
        {
            long length = 0;

            //
            // NOTE: Only attempt to query the length of
            //       seekable streams.
            //
            if (stream.CanSeek)
            {
                length = stream.Length;
            }

            if (length > 0)
            {
                list = new ByteList((int)Math.Min(length, MaximumBufferSize));
            }
            else
            {
                list = new ByteList();
            }
        }

        //
        // NOTE: Read from the stream in a loop until we hit a terminator
        //       (typically "end of line" or "end of file").
        //
        int readCount = 0;
        bool eolFound = false;
        int eolLength = (endOfLine != null) ? endOfLine.Count : 0;
        int eolIndex = 0;

        do
        {
            int value = ReadByte(stream);

            //
            // NOTE: Did we hit the end of the stream?
            //
            if (value != EndOfFile)
            {
                byte byteValue = ConversionOps.ToByte(value);

                //
                // NOTE: Did they supply a valid end-of-line sequence to check
                //       against?
                //
                if ((endOfLine != null) && (eolLength > 0))
                {
                    //
                    // NOTE: Does the caller want to stop reading as soon as any of
                    //       the supplied end-of-line characters are detected?
                    //
                    if (useAnyEndOfLineChar)
                    {
                        //
                        // NOTE: Does the byte match any of the supplied end-of-line
                        //       characters?
                        //
                        if (endOfLine.Contains(ConversionOps.ToChar(byteValue)))
                        {
                            eolFound = true;
                        }
                    }
                    else
                    {
                        //
                        // NOTE: Does the byte we just read match the next character in
                        //       the end-of-line sequence we were expecting to see?
                        //
                        if (byteValue == endOfLine[eolIndex])
                        {
                            //
                            // NOTE: Have we just matched the last character of the
                            //       end-of-line sequence?  If so, we have found the
                            //       end-of-line and we are done.
                            //
                            if (++eolIndex == eolLength)
                            {
                                eolFound = true; /* NOTE: Hit end-of-line sequence. */
                            }
                        }
                        else if (eolIndex > 0)
                        {
                            //
                            // NOTE: The partial match is broken; however, the bytes
                            //       seen so far (the matched prefix plus this byte)
                            //       may still end with a shorter prefix of the
                            //       end-of-line sequence (e.g. "\r\r\n" when looking
                            //       for "\r\n").  Fall back to the longest such
                            //       prefix instead of discarding it; previously the
                            //       index was simply reset and that end-of-line was
                            //       missed.
                            //
                            int matched = eolIndex;

                            eolIndex = 0;

                            for (int keep = matched; keep > 0; keep--)
                            {
                                //
                                // NOTE: The candidate prefix must end with the byte
                                //       we just read.
                                //
                                if (byteValue != endOfLine[keep - 1])
                                    continue;

                                //
                                // NOTE: The rest of the candidate must equal the tail
                                //       of the previously matched characters.
                                //
                                bool same = true;
                                int tailStart = matched - (keep - 1);

                                for (int index = 0; index < keep - 1; index++)
                                {
                                    if (endOfLine[index] != endOfLine[tailStart + index])
                                    {
                                        same = false;
                                        break;
                                    }
                                }

                                if (same)
                                {
                                    eolIndex = keep;
                                    break;
                                }
                            }
                        }
                    }
                }

                //
                // NOTE: Add the byte (which could potentially be part of an end-of-line
                //       sequence) to the buffer.
                //
                list.Add(byteValue);

                //
                // NOTE: We just read another byte, keep track.
                //
                readCount++;

                //
                // NOTE: Now that we have added the byte to the buffer, check to see if we
                //       hit the end-of-line (above).  If so, remove the end-of-line sequence
                //       from the end of the buffer and bail out.
                //
                if (eolFound)
                {
                    int bufferLength = list.Count;

                    RemoveEndOfLine(list.ToArray(), endOfLine, useAnyEndOfLineChar, ref bufferLength);

                    while (list.Count > bufferLength)
                    {
                        list.RemoveAt(list.Count - 1);
                    }

                    break;
                }
            }
            else
            {
                hitEndOfStream = true; /* NOTE: No more data. */
                break;
            }
        }
        while ((count == Count.Invalid) || (readCount < count));

        TranslateEndOfLine(StreamDirection.Input, list, ref list); // TEST: Test this.

        code = ReturnCode.Ok;
    }
    else
    {
        error = "invalid stream";
    }

    return code;
}