Пример #1
0
        ///////////////////////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// Shrinks <paramref name="bufferLength"/> so that it no longer counts a
        /// trailing end-of-line.  The bytes in <paramref name="buffer"/> are
        /// only read, never modified.
        /// </summary>
        /// <param name="buffer">
        /// The bytes to inspect.  Nothing happens when this is null.
        /// </param>
        /// <param name="endOfLine">
        /// The end-of-line characters.  Nothing happens when this is null.
        /// </param>
        /// <param name="useAnyEndOfLineChar">
        /// When true, every trailing byte that is contained in
        /// <paramref name="endOfLine"/> is discounted.  When false, the whole
        /// sequence is discounted once, and only if the counted bytes end with
        /// it exactly.
        /// </param>
        /// <param name="bufferLength">
        /// On input, the number of bytes of <paramref name="buffer"/> to
        /// consider (nothing happens unless it is positive).  On output, the
        /// number of bytes that remain after the end-of-line was discounted.
        /// </param>
        private static void RemoveEndOfLine(
            byte[] buffer,            /* in */
            CharList endOfLine,       /* in */
            bool useAnyEndOfLineChar, /* in */
            ref int bufferLength      /* in, out */
            )
        {
            //
            // NOTE: Without a buffer, a terminator, or any counted bytes
            //       there is nothing to do.
            //
            if ((buffer == null) || (endOfLine == null) || (bufferLength <= 0))
            {
                return;
            }

            if (useAnyEndOfLineChar)
            {
                //
                // NOTE: Walk backwards over every trailing byte that is one
                //       of the end-of-line characters.
                //
                int lastKept = bufferLength - 1;

                while ((lastKept >= 0) &&
                       endOfLine.Contains(ConversionOps.ToChar(buffer[lastKept])))
                {
                    lastKept--;
                }

                bufferLength = lastKept + 1;
                return;
            }

            //
            // NOTE: Exact-sequence mode; the counted bytes must be at least
            //       as long as the sequence for it to be present.
            //
            int sequenceLength = endOfLine.Count;

            if (bufferLength < sequenceLength)
            {
                return;
            }

            int tailStart = bufferLength - sequenceLength;

            for (int offset = 0; offset < sequenceLength; offset++)
            {
                if (buffer[tailStart + offset] !=
                    ConversionOps.ToByte(endOfLine[offset]))
                {
                    //
                    // NOTE: The tail is not the end-of-line sequence, leave
                    //       the length alone.
                    //
                    return;
                }
            }

            bufferLength -= sequenceLength;
        }
        /// <summary>
        /// Replaces every character whose lower-case form is not contained in
        /// <c>CharList</c> with a single space.  The builder is modified in
        /// place and the same instance is returned.
        /// </summary>
        /// <param name="text">The text to clean; must not be null.</param>
        /// <returns>The same <paramref name="text"/> instance.</returns>
        private StringBuilder RemoveTrashXMLFast(StringBuilder text)
        {
            // One snapshot and one linear pass.  The previous version called
            // StringBuilder.Replace over the whole buffer once per offending
            // character occurrence, which is quadratic on noisy input.
            char[] characters = text.ToString().ToCharArray();
            bool   changed    = false;

            for (int index = 0; index < characters.Length; index++)
            {
                char current = characters[index];

                // char.ToLower uses the current culture, as the string-based
                // ToLower() it replaces did, so the same characters are kept.
                if (current != ' ' && !CharList.Contains(char.ToLower(current)))
                {
                    characters[index] = ' ';
                    changed           = true;
                }
            }

            if (changed)
            {
                // Write back into the caller's builder rather than returning a
                // new one, so callers that ignore the return value still see
                // the cleaned text.
                text.Length = 0;
                text.Append(characters);
            }

            return(text);
        }
        /// <summary>
        /// Normalises an n-gram: lower-cases it, masks an invalid first or
        /// last character with <c>"*"</c>, and rejects n-grams that carry no
        /// usable letters.  A character is valid when it is contained in
        /// <c>CharList</c> and <see cref="char.IsLetter(char)"/> is true.
        /// </summary>
        /// <param name="ngram">The raw n-gram.</param>
        /// <returns>
        /// The normalised n-gram, or null when:
        /// the input is null or empty;
        /// a one-character n-gram is not valid;
        /// both characters of a two-character n-gram are invalid;
        /// or any inner character of a longer n-gram is invalid.
        /// </returns>
        private string ModifyNGram(string ngram)
        {
            // An empty string used to reach the indexer below and throw
            // IndexOutOfRangeException; treat it (and null) as "no n-gram".
            if (string.IsNullOrEmpty(ngram))
            {
                return(null);
            }

            ngram = ngram.ToLower();

            char head      = ngram[0];
            char tail      = ngram[ngram.Length - 1];
            bool headValid = CharList.Contains(head) && char.IsLetter(head);
            bool tailValid = CharList.Contains(tail) && char.IsLetter(tail);

            if (ngram.Length == 1)
            {
                return(headValid ? ngram : null);
            }

            // Edge characters are masked rather than rejected.
            string first = headValid ? head.ToString() : "*";
            string last  = tailValid ? tail.ToString() : "*";

            if (ngram.Length == 2)
            {
                // Two masked characters carry no information at all.
                if (!headValid && !tailValid)
                {
                    return(null);
                }

                return(first + last);
            }

            // Three or more characters: unlike the edges, every inner
            // character has to be valid or the whole n-gram is rejected.
            string center = ngram.Substring(1, ngram.Length - 2);

            foreach (char c in center)
            {
                if (!CharList.Contains(c) || !char.IsLetter(c))
                {
                    return(null);
                }
            }

            return(first + center + last);
        }
        /// <summary>
        /// Cleans plain text in four steps: removes <c>[...]</c> spans, removes
        /// <c>&lt;...&gt;</c> spans, deletes every <c>"=="</c>, and finally
        /// replaces each character whose lower-case form is not contained in
        /// <c>CharList</c> with a space.  The builder is modified in place and
        /// the same instance is returned.
        /// </summary>
        /// <param name="text">The text to clean; must not be null.</param>
        /// <returns>The same <paramref name="text"/> instance.</returns>
        private StringBuilder RemoveTrashPlainText(StringBuilder text)
        {
            // Work on strings and scan forward once per step.  The previous
            // version re-materialised the whole builder several times for
            // every removed span and once more per replaced character.
            string remaining = text.ToString();

            // Row 0 is handled completely before row 1, matching the original
            // order: square brackets first, then angle brackets.
            char[,] delimiters = { { '[', ']' }, { '<', '>' } };

            for (int pair = 0; pair < delimiters.GetLength(0); pair++)
            {
                char opening = delimiters[pair, 0];
                char closing = delimiters[pair, 1];

                StringBuilder kept   = new StringBuilder(remaining.Length);
                int           cursor = 0;

                while (cursor < remaining.Length)
                {
                    int start = remaining.IndexOf(opening, cursor);
                    if (start == -1)
                    {
                        break;
                    }

                    // A span runs from the opener to the first closer after
                    // it, even if further openers occur in between.
                    int end = remaining.IndexOf(closing, start);
                    if (end == -1)
                    {
                        // Unterminated opener: stop and keep the rest as is.
                        break;
                    }

                    kept.Append(remaining, cursor, start - cursor);
                    cursor = end + 1;
                }

                kept.Append(remaining, cursor, remaining.Length - cursor);
                remaining = kept.ToString();
            }

            remaining = remaining.Replace("==", "");

            // Blank out everything that is not an accepted character.
            // char.ToLower uses the current culture, as the string-based
            // ToLower() it replaces did.
            char[] characters = remaining.ToCharArray();

            for (int index = 0; index < characters.Length; index++)
            {
                if (!CharList.Contains(char.ToLower(characters[index])))
                {
                    characters[index] = ' ';
                }
            }

            // Write back into the caller's builder so callers that ignore the
            // return value still see the cleaned text.
            text.Length = 0;
            text.Append(characters);

            return(text);
        }
        /// <summary>
        /// Strips wiki-style markup from <paramref name="text"/> in place:
        /// brace runs are normalised, flat and nested wrapper spans are
        /// removed, <c>[[...]]</c> links are unwrapped, <c>[...]</c> spans are
        /// removed, and finally every character whose lower-case form is not
        /// contained in <c>CharList</c> is replaced with a space.
        /// </summary>
        /// <remarks>
        /// All marker searches are ordinal.  The spans are removed with fixed
        /// marker lengths, so a culture-sensitive search (which may match a
        /// different number of characters) could remove the wrong range.
        /// </remarks>
        /// <param name="text">The text to clean; must not be null.</param>
        /// <returns>The same <paramref name="text"/> instance.</returns>
        private StringBuilder RemoveTrashXML(StringBuilder text)
        {
            // Normalise brace runs.
            // NOTE(review): once every "}}}" has been removed, no run of four
            // closing braces can remain, so the final Replace appears to be
            // unreachable.  Kept as is to preserve the current output.
            text = text.Replace(":{{", "{{").Replace("{{{", "{{").Replace("}}}", "").Replace("}}}}", "}}");

            // Wrappers removed without nesting: from the first opener to the
            // first closer found at or after it.
            // NOTE(review): the closer search starts AT the opener, so for
            // "==" and "=" (opener equals closer) only the marker itself is
            // removed, not the text between two markers.  Confirm the intent.
            List<Tuple<string, string>> flatWrappers = new List<Tuple<string, string>>()
            {
                new Tuple<string, string>("==", "=="),
                new Tuple<string, string>("=", "="),
                new Tuple<string, string>("<ref>", "</ref>"),
                new Tuple<string, string>("[[File:", ".]]"),
                new Tuple<string, string>("[[Image:", "]]"),
                new Tuple<string, string>("[[wikt:", "]]")
            };

            foreach (Tuple<string, string> wrapper in flatWrappers)
            {
                while (true)
                {
                    // One snapshot per iteration; the builder only changes at
                    // the Remove call at the bottom.
                    string snapshot = text.ToString();

                    int start = snapshot.IndexOf(wrapper.Item1, StringComparison.Ordinal);
                    if (start == -1)
                    {
                        break;
                    }

                    int end = snapshot.IndexOf(wrapper.Item2, start, StringComparison.Ordinal);
                    if (end == -1)
                    {
                        // Unterminated opener: give up on this wrapper.
                        break;
                    }

                    text.Remove(start, end - start + wrapper.Item2.Length);
                }
            }

            // Wrappers that may nest.
            List<Tuple<string, string>> nestedWrappers = new List<Tuple<string, string>>()
            {
                new Tuple<string, string>("[[File:", "]]"),
                new Tuple<string, string>("{{", "}}"),
                new Tuple<string, string>("{|", "|}"),
                new Tuple<string, string>("(", ")")
            };

            foreach (Tuple<string, string> wrapper in nestedWrappers)
            {
                while (true)
                {
                    string snapshot = text.ToString();

                    int start = snapshot.IndexOf(wrapper.Item1, StringComparison.Ordinal);
                    if (start == -1)
                    {
                        break;
                    }

                    int end   = snapshot.IndexOf(wrapper.Item2, start, StringComparison.Ordinal);
                    int probe = start;

                    // For every further opener that lies before the current
                    // closer, move on to the next closer.  Stops when there
                    // are no more openers, when the next opener is past the
                    // closer, or when the closers run out (end == -1).
                    while (true)
                    {
                        probe = snapshot.IndexOf(wrapper.Item1, probe + 1, StringComparison.Ordinal);

                        if (end > probe && probe != -1)
                        {
                            end = snapshot.IndexOf(wrapper.Item2, end + 1, StringComparison.Ordinal);
                        }
                        else
                        {
                            break;
                        }
                    }

                    if (end == -1)
                    {
                        break;
                    }

                    text.Remove(start, end - start + wrapper.Item2.Length);
                }
            }

            // Unwrap '[[ ]]': the brackets go, and when a '|' lies between
            // them everything from the '|' up to the closing brackets goes
            // too, leaving only the part before the '|'.
            while (true)
            {
                string snapshot = text.ToString();

                int start = snapshot.IndexOf("[[", StringComparison.Ordinal);
                if (start == -1)
                {
                    break;
                }

                int end     = snapshot.IndexOf("]]", start, StringComparison.Ordinal);
                int divider = snapshot.IndexOf("|", start, StringComparison.Ordinal);

                if (end == -1)
                {
                    break;
                }

                // The later range is removed first so that the earlier
                // positions stay valid.
                if (divider > start && divider < end)
                {
                    text.Remove(divider, end - divider + 2);
                }
                else
                {
                    text.Remove(end, 2);
                }

                text.Remove(start, 2);
            }

            // Remove '[ ]' spans together with their content.
            while (true)
            {
                string snapshot = text.ToString();

                int start = snapshot.IndexOf("[", StringComparison.Ordinal);
                if (start == -1)
                {
                    break;
                }

                int end = snapshot.IndexOf("]", start, StringComparison.Ordinal);
                if (end == -1)
                {
                    break;
                }

                text.Remove(start, end - start + 1);
            }

            // Blank out everything that is not an accepted character, in one
            // linear pass instead of one whole-buffer Replace per character.
            // char.ToLower uses the current culture, as the string-based
            // ToLower() it replaces did.
            char[] characters = text.ToString().ToCharArray();
            bool   changed    = false;

            for (int index = 0; index < characters.Length; index++)
            {
                char current = characters[index];

                if (current != ' ' && !CharList.Contains(char.ToLower(current)))
                {
                    characters[index] = ' ';
                    changed           = true;
                }
            }

            if (changed)
            {
                text.Length = 0;
                text.Append(characters);
            }

            return(text);
        }
Пример #6
0
        ///////////////////////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// Reads bytes from the underlying stream into <paramref name="list"/>
        /// until an end-of-line is seen, the stream ends, or
        /// <paramref name="count"/> bytes have been read.  A detected
        /// end-of-line is removed from the end of the list before returning.
        /// </summary>
        /// <param name="count">
        /// The maximum number of bytes to read, or <c>Count.Invalid</c> for no
        /// limit.  At least one read is always attempted.
        /// </param>
        /// <param name="endOfLine">
        /// The end-of-line characters.  When null or empty, no end-of-line
        /// detection is performed.
        /// </param>
        /// <param name="useAnyEndOfLineChar">
        /// When true, reading stops at the first byte that is any one of the
        /// end-of-line characters.  When false, reading stops only after the
        /// whole sequence has been read consecutively.
        /// </param>
        /// <param name="list">
        /// Receives the bytes read.  Allocated here when null; otherwise the
        /// bytes are appended to it.
        /// </param>
        /// <param name="error">
        /// Set to an error message when there is no stream to read from.
        /// </param>
        /// <returns>
        /// <c>ReturnCode.Ok</c> when the stream was read (even if no bytes
        /// were available); <c>ReturnCode.Error</c> when there is no stream.
        /// </returns>
        public ReturnCode Read( /* throw */
            int count,
            CharList endOfLine,
            bool useAnyEndOfLineChar,
            ref ByteList list,
            ref Result error
            )
        {
            CheckDisposed();

            ReturnCode code = ReturnCode.Error;

            if (stream != null)
            {
                //
                // NOTE: Allocate enough for the whole file.
                //
                if (list == null)
                {
                    long length = 0;

                    //
                    // NOTE: Only attempt to query the length of
                    //       seekable streams.
                    //
                    if (stream.CanSeek)
                    {
                        length = stream.Length;
                    }

                    if (length > 0)
                    {
                        list = new ByteList((int)Math.Min(length, MaximumBufferSize));
                    }
                    else
                    {
                        list = new ByteList();
                    }
                }

                //
                // NOTE: Read from the stream in a loop until we hit a terminator
                //       (typically "end of line" or "end of file").
                //
                int  readCount = 0;
                bool eolFound  = false;
                int  eolLength = (endOfLine != null) ? endOfLine.Count : 0;
                int  eolIndex  = 0;

                do
                {
                    int value = ReadByte(stream);

                    //
                    // NOTE: Did we hit the end of the stream?
                    //
                    if (value != EndOfFile)
                    {
                        byte byteValue = ConversionOps.ToByte(value);

                        //
                        // NOTE: Did they supply a valid end-of-line sequence to check
                        //       against?
                        //
                        if ((endOfLine != null) && (eolLength > 0))
                        {
                            //
                            // NOTE: Does the caller want to stop reading as soon as any of
                            //       the supplied end-of-line characters are detected?
                            //
                            if (useAnyEndOfLineChar)
                            {
                                //
                                // NOTE: Does the byte match any of the supplied end-of-line
                                //       characters?
                                //
                                if (endOfLine.Contains(ConversionOps.ToChar(byteValue)))
                                {
                                    eolFound = true;
                                }
                            }
                            else
                            {
                                //
                                // NOTE: Does the byte we just read match the next character in
                                //       the end-of-line sequence we were expecting to see?
                                //
                                if (byteValue == endOfLine[eolIndex])
                                {
                                    //
                                    // NOTE: Have we just matched the last character of the
                                    //       end-of-line sequence?  If so, we have found the
                                    //       end-of-line and we are done.
                                    //
                                    if (++eolIndex == eolLength)
                                    {
                                        eolFound = true; /* NOTE: Hit end-of-line sequence. */
                                    }
                                }
                                else if (eolIndex > 0)
                                {
                                    //
                                    // NOTE: The partial match is broken; however, the bytes seen
                                    //       so far (including this one) may still end with a
                                    //       shorter prefix of the sequence.  Simply resetting to
                                    //       zero here would miss, for example, "\r\r\n" when the
                                    //       sequence is "\r\n".
                                    //
                                    eolIndex = RestartEndOfLineMatch(
                                        endOfLine, eolIndex, byteValue);
                                }
                            }
                        }

                        //
                        // NOTE: Add the byte (which could potentially be part of an end-of-line
                        //       sequence) to the buffer.
                        //
                        list.Add(byteValue);

                        //
                        // NOTE: We just read another byte, keep track.
                        //
                        readCount++;

                        //
                        // NOTE: Now that we have added the byte to the buffer, check to see if we
                        //       hit the end-of-line (above).  If so, remove the end-of-line sequence
                        //       from the end of the buffer and bail out.
                        //
                        if (eolFound)
                        {
                            int bufferLength = list.Count;

                            RemoveEndOfLine(list.ToArray(), endOfLine, useAnyEndOfLineChar, ref bufferLength);

                            //
                            // NOTE: Trim the list down to the length that was computed
                            //       above.
                            //
                            while (list.Count > bufferLength)
                            {
                                list.RemoveAt(list.Count - 1);
                            }

                            break;
                        }
                    }
                    else
                    {
                        hitEndOfStream = true; /* NOTE: No more data. */
                        break;
                    }
                }while ((count == Count.Invalid) || (readCount < count));

                TranslateEndOfLine(StreamDirection.Input, list, ref list); // TEST: Test this.

                code = ReturnCode.Ok;
            }
            else
            {
                error = "invalid stream";
            }

            return(code);
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// Computes how many leading characters of the end-of-line sequence
        /// are still matched after a partial match was broken by
        /// <paramref name="byteValue"/>.
        /// </summary>
        /// <param name="endOfLine">The end-of-line sequence.</param>
        /// <param name="matchedLength">
        /// The number of leading sequence characters that had been matched
        /// immediately before <paramref name="byteValue"/> was read.
        /// </param>
        /// <param name="byteValue">The byte that broke the match.</param>
        /// <returns>
        /// The length of the longest prefix of the sequence that the input
        /// read so far (ending with <paramref name="byteValue"/>) ends with;
        /// zero when there is none.
        /// </returns>
        private static int RestartEndOfLineMatch(
            CharList endOfLine, /* in */
            int matchedLength,  /* in */
            byte byteValue      /* in */
            )
        {
            //
            // NOTE: The last "matchedLength" bytes read are known to equal the
            //       first "matchedLength" sequence characters, so candidates
            //       can be checked against the sequence itself.  Try the
            //       longest candidate first.
            //
            for (int length = matchedLength; length > 0; length--)
            {
                //
                // NOTE: The candidate prefix must end with the byte just read.
                //
                if (endOfLine[length - 1] != byteValue)
                {
                    continue;
                }

                //
                // NOTE: The rest of the candidate must equal the tail of the
                //       previously matched characters.
                //
                int  offset        = matchedLength - (length - 1);
                bool prefixMatches = true;

                for (int index = 0; index < length - 1; index++)
                {
                    if (endOfLine[index] != endOfLine[offset + index])
                    {
                        prefixMatches = false;
                        break;
                    }
                }

                if (prefixMatches)
                {
                    return length;
                }
            }

            return 0;
        }