Example #1
0
        /// <summary>
        /// Opens the file at <paramref name="filePath"/> for shared reading, skips the leading markup up to
        /// and including the first tag that contains <c>&lt;body</c>, and then repeatedly builds
        /// <c>ParseDataItem</c> instances from the rest of the stream, adding each one that reports data.
        /// </summary>
        /// <param name="filePath">Path of the file to parse. Its file name and full path are stored in
        /// <c>FileName</c> and <c>FilePath</c> before the file is opened.</param>
        public void Load(string filePath)
        {
            FileName = Path.GetFileName(filePath);
            FilePath = Path.GetFullPath(filePath);

            // The using statements dispose the reader and both streams (innermost first) when the scope
            // ends, including on exceptions, so no explicit Dispose calls are needed.
            using (FileStream fs = File.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            using (BufferedStream bs = new BufferedStream(fs))
            using (StreamReader sr = new StreamReader(bs))
            {
                bool insideBody = false;
                while (sr.Peek() >= 0)
                {
                    // Skip the preamble tag by tag until the body element opens. The end-of-stream check
                    // stops this scan on input that never contains a body tag instead of relying on how
                    // ReadUntil behaves once the reader is exhausted.
                    while (!insideBody && sr.Peek() >= 0)
                    {
                        string tag = StreamReaderExtensions.ReadUntil(sr, '>');
                        insideBody = tag != null &&
                                     tag.IndexOf("<body", System.StringComparison.OrdinalIgnoreCase) >= 0;
                    }

                    if (!insideBody)
                    {
                        // Ran out of input without ever seeing a body tag: there is nothing to parse.
                        break;
                    }

                    ParseDataItem dataItem = new ParseDataItem(sr, this, null);
                    if (dataItem.HasData)
                    {
                        AddItem(dataItem);
                    }
                }
            }
        }
Example #2
0
        /// <summary>
        /// Builds a data item by consuming markup from <paramref name="sr"/>. Reading continues until the
        /// tag stack tracked by this constructor becomes empty or the reader stops reporting characters.
        /// Header text, data values and nested child items found along the way are recorded on this instance.
        /// </summary>
        /// <param name="sr">Reader to consume from; it is advanced as a side effect and is shared with any
        /// nested items created here.</param>
        /// <param name="parentDocument">Stored in ParentDocument and passed on to nested items.</param>
        /// <param name="parentDataItem">Stored in ParentDataItem; nested items receive this instance.</param>
        public ParseDataItem(StreamReader sr, ParseDocument parentDocument, ParseDataItem parentDataItem)
        {
            ParentDocument = parentDocument;
            ParentDataItem = parentDataItem;

            // nextTag: a tag that was already read from the stream while collecting text and still has to
            //          be processed by the next loop iteration (empty when nothing is pending).
            // associatedChildCount: number of child items added since the last value was recorded.
            // tagStack: the currently open tags, limited to names found in tagsToProcess.
            string         nextTag = string.Empty;
            int            associatedChildCount = 0;
            Stack <string> tagStack             = new Stack <string>();

            do
            {
                // NOTE(review): the test is "> 0", so a NUL character (0) stops the loop just like
                // end of stream (-1) does.
                while (sr.Peek() > 0)
                {
                    char firstChar;
                    if (string.IsNullOrEmpty(nextTag))
                    {
                        // Nothing pending: take the next character from the stream, skipping control
                        // characters (line breaks, tabs, ...).
                        firstChar = (char)sr.Read();
                        while (sr.Peek() > 0 && char.IsControl(firstChar))
                        {
                            firstChar = (char)sr.Read();
                        }
                    }
                    else
                    {
                        // A tag is pending: classify it by its first character.
                        firstChar = nextTag[0];
                    }

                    // When the current character is '/', toTestNode is the pending tag text (possibly
                    // empty) and toTest becomes the tag name extracted from it, without the leading '/'
                    // and without attributes or the closing '>'.
                    // NOTE(review): the '>' lookup presumably relies on ReadUntil keeping the delimiter in
                    // the returned text; a negative IndexOf result would make Substring throw.
                    string toTest     = string.Empty;
                    string toTestNode = firstChar.Equals('/') ? nextTag : string.Empty;
                    if (!string.IsNullOrEmpty(toTestNode))
                    {
                        if (toTestNode.Contains(" ") || toTestNode.StartsWith("br"))
                        {
                            toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(" ") - 1 : toTestNode.IndexOf(" ")).Trim();
                        }
                        else
                        {
                            toTest = toTestNode.Substring(toTestNode.StartsWith("/") ? 1 : 0, toTestNode.StartsWith("/") ? toTestNode.IndexOf(">") - 1 : toTestNode.IndexOf(">")).Trim();
                        }
                    }
                    // Decide whether the current position is markup (handled in this branch) or text
                    // (handled in the else branch). '<' always counts as markup; '/' counts as markup
                    // depending on the tag on top of the stack, whether data has been collected, and
                    // whether the decoded header is one of the reserved headers.
                    // NOTE(review): tagStack.Peek() is evaluated whenever firstChar is '/', and
                    // Stack<T>.Peek throws InvalidOperationException on an empty stack; no guard for
                    // that case is visible here. These comparisons also use the raw stack entry,
                    // whereas the comparisons further down trim and lower-case it first.
                    if (firstChar.Equals('<') ||
                        ((firstChar.Equals('/') && tagStack.Peek() != DATA_TAG) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && (!string.IsNullOrEmpty(toTest) && !tagsToProcess.Contains(toTest))) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && HasData && reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim())) ||
                         (firstChar.Equals('/') && tagStack.Peek() == DATA_TAG && !reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim()))))
                    {
                        // Use the pending tag if there is one (dropping a leading '<'), otherwise read
                        // the rest of the tag from the stream. Either way the pending tag is consumed.
                        string tagNode = string.IsNullOrEmpty(nextTag) ? StreamReaderExtensions.ReadUntil(sr, '>') : nextTag.StartsWith("<") ? nextTag.Substring(1) : nextTag;
                        nextTag = string.Empty;

                        // A tag is a closing tag when a '/' was seen either as the current character or
                        // at the start of the tag text.
                        bool   isOpenNode = !(firstChar.Equals('/') || tagNode.StartsWith("/"));
                        // Extract the bare tag name: skip a leading '/', stop at the first space when the
                        // tag has one (or starts with "br"), otherwise stop at '>'.
                        string tag        = string.Empty;
                        if (tagNode.Contains(" ") || tagNode.StartsWith("br"))
                        {
                            tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? tagNode.IndexOf(" ") - 1 : tagNode.IndexOf(" ")).Trim();
                        }
                        else
                        {
                            tag = tagNode.Substring(tagNode.StartsWith("/") ? 1 : 0, tagNode.StartsWith("/") ? tagNode.IndexOf(">") - 1 : tagNode.IndexOf(">")).Trim();
                        }

                        // A TABLE_TAG encountered while a DATA_TAG is on top of the stack is a nested
                        // table: an opening tag is parsed recursively into a child item that continues
                        // reading from the same reader. A closing tag in this situation is ignored.
                        if (tagStack.Any() && tagStack.Peek().Trim().ToLower() == DATA_TAG && tag.Trim().ToLower() == TABLE_TAG)
                        {
                            if (isOpenNode)
                            {
                                ParseDataItem newChild = new ParseDataItem(sr, ParentDocument, this);
                                if (newChild.HasData)
                                {
                                    AddChild(newChild);
                                    associatedChildCount++;
                                }
                            }
                        }
                        else
                        {
                            // Only tags listed in tagsToProcess affect the stack; all others are skipped.
                            if (tagsToProcess.Contains(tag))
                            {
                                if (isOpenNode)
                                {
                                    tagStack.Push(tag);
                                }
                                else
                                {
                                    // A closing tag pops the stack only when it matches the top entry.
                                    if (tagStack.Peek().Equals(tag))
                                    {
                                        tagStack.Pop();
                                    }
                                    // Closing a data tag: attach the number of children collected so far
                                    // to the most recent value (if any values exist) and restart the count.
                                    if (tag.Equals(DATA_TAG))
                                    {
                                        if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                        {
                                            ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                        }
                                        associatedChildCount = 0;
                                    }
                                }
                            }
                            // Leaves the inner while loop once no tracked tag is open; the outer
                            // do/while condition then ends the constructor.
                            if (!tagStack.Any())
                            {
                                break;
                            }
                        }
                    }
                    else
                    {
                        // Text content. It is only examined while at least one tracked tag is open.
                        if (tagStack != null && tagStack.Count > 0)
                        {
                            bool done          = false;
                            int  dataLoopCount = 0;
                            do
                            {
                                // control is the text that would result from a single character followed
                                // directly by '<'. It is left empty when firstChar is a digit, 'X' or 'Y'
                                // (any case), so that such one-character texts are still treated as data.
                                string control = Int32.TryParse(firstChar.ToString(), out int temp) || firstChar.ToString().ToUpper().Equals("X") ||
                                                 firstChar.ToString().ToUpper().Equals("Y") ? string.Empty : firstChar + "<";
                                // On the first pass the already consumed firstChar is put back in front
                                // of the text read from the stream.
                                string text = dataLoopCount == 0 ? firstChar + StreamReaderExtensions.ReadUntil(sr, '<') : StreamReaderExtensions.ReadUntil(sr, '<');
                                if (!text.Equals(control) && !text.StartsWith("br /") && !text.Equals("<"))
                                {
                                    // Drop the last character (presumably the '<' delimiter kept by
                                    // ReadUntil) and remove control characters.
                                    text = new string(text.Substring(0, text.Length - 1).Where(c => !char.IsControl(c)).ToArray());
                                    if (!string.IsNullOrEmpty(text))
                                    {
                                        // Text inside a header tag becomes the header; text inside a data
                                        // tag becomes a new value.
                                        if (tagStack.Peek().Trim().ToLower() == HEADER_TAG)
                                        {
                                            HTMLDecodedHeader = text;
                                        }
                                        else if (tagStack.Peek().Trim().ToLower() == DATA_TAG)
                                        {
                                            // Before adding the next value, attach the child count
                                            // collected so far to the previous value.
                                            if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                            {
                                                ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                            }
                                            associatedChildCount = 0;
                                            AddValue(text);
                                        }
                                    }
                                    // Read the tag that ended the text. A line-break tag means more text
                                    // may follow, so the loop continues; any other tag is left in nextTag
                                    // for the outer loop.
                                    nextTag = StreamReaderExtensions.ReadUntil(sr, '>');
                                    if (!nextTag.StartsWith("br /") && !nextTag.StartsWith("br/"))
                                    {
                                        done = true;
                                    }
                                    dataLoopCount++;
                                }
                                else
                                {
                                    // No usable text: read the following tag and leave it pending.
                                    nextTag = StreamReaderExtensions.ReadUntil(sr, '>');

                                    // If the tag (without its '>') is one of the tracked tags, prefix it
                                    // with '<' so the next iteration sees '<' as its first character and
                                    // takes the markup branch.
                                    string tempTag = nextTag;
                                    tempTag = tempTag.Replace(">", "");
                                    if (tagsToProcess.Contains(tempTag))
                                    {
                                        nextTag = "<" + nextTag;
                                    }
                                    done = true;
                                }
                            } while (!done);
                        }
                    }
                }
            } while (tagStack.Any() && sr.Peek() > 0);
        }
 /// <summary>
 /// Calls <c>StreamReaderExtensions.ReadTo</c> with a null reader. The method name indicates that an
 /// exception is the expected outcome; no assertion is visible in this method itself, so presumably
 /// the expectation is declared outside of it (e.g. by a test attribute) — TODO confirm.
 /// </summary>
 public void StreamReader_ReadTo_ReaderIsNull_ShouldThrowException()
 {
     // Neither the return value nor the out argument is inspected; only the call itself matters.
     StreamReaderExtensions.ReadTo(null, out var needle, "Test");
 }
 /// <summary>
 /// Calls <c>StreamReaderExtensions.Find</c> with a null reader. The method name indicates that an
 /// exception is the expected outcome; no assertion is visible in this method itself, so presumably
 /// the expectation is declared outside of it (e.g. by a test attribute) — TODO confirm.
 /// </summary>
 public void StreamReader_Find_ReaderIsNull_ShouldThrowException()
     => StreamReaderExtensions.Find(null, "Test");
Example #5
0
        /// <summary>
        /// Expands a stream that starts with the four bytes "AKLZ" (0x41 0x4B 0x4C 0x5A) into a new
        /// <see cref="MemoryStream"/>. Input without that signature is not decoded; its bytes, as produced
        /// by <c>StreamReaderExtensions.ToByteArray</c>, are wrapped and returned instead.
        /// </summary>
        /// <param name="data">Source stream; its position is reset to 0 before reading.</param>
        /// <returns>
        /// A stream over the decoded bytes, a stream over the input bytes when the signature is missing,
        /// or <c>null</c> when any exception occurs while decoding.
        /// </returns>
        public static MemoryStream Decompress(Stream data)
        {
            try
            {
                data.Position = 0;

                // Signature check: anything that does not start with "AKLZ" is passed through.
                byte[] signature = new byte[4];
                data.Read(signature, 0, 4);
                bool hasSignature = signature[0] == 0x41 &&
                                    signature[1] == 0x4B &&
                                    signature[2] == 0x4C &&
                                    signature[3] == 0x5A;
                if (!hasSignature)
                {
                    return new MemoryStream(StreamReaderExtensions.ToByteArray(data));
                }

                // Back-references address a window of this many bytes.
                const uint WINDOW_SIZE = 0x1000;

                // The output size comes from the value read with offset argument 0xC, byte-swapped.
                uint inputLength  = (uint)data.Length;
                uint outputLength = EndianUtil.SwapEndian(StreamReaderExtensions.ReadUInt(data, 0xC));

                byte[] input  = StreamReaderExtensions.ToByteArray(data);
                byte[] output = new byte[outputLength];

                // Decoding starts at input offset 0x10.
                uint readPos  = 0x10;
                uint writePos = 0x0;

                while (readPos < inputLength && writePos < outputLength)
                {
                    // Each flag byte governs up to eight following entries, lowest bit first.
                    byte flags = input[readPos];
                    readPos++;

                    for (int bit = 0; bit < 8; ++bit)
                    {
                        bool isLiteral = ((flags >> bit) & 0x01) != 0;
                        if (isLiteral)
                        {
                            // Set bit: one byte is copied straight from the input.
                            output[writePos] = input[readPos];
                            readPos++;
                            writePos++;
                        }
                        else
                        {
                            // Clear bit: a two-byte back-reference. The first byte plus the high nibble
                            // of the second form a 12-bit offset (biased by 0x12); the low nibble of the
                            // second byte is the run length minus 3.
                            byte first  = input[readPos];
                            byte second = input[readPos + 1];
                            readPos += 2;

                            int windowOffset = (first | ((second & 0xF0) << 4)) + 0x12;
                            int runLength    = (second & 0x0F) + 3;

                            // Translate the window offset into an index into the output produced so far:
                            // advance by one window for every multiple that still lies before the write
                            // position, then step back one window if the result overshoots it.
                            int  copyFrom = windowOffset;
                            uint windows  = writePos / WINDOW_SIZE;
                            for (int wrap = 1; wrap <= windows; ++wrap)
                            {
                                if (windowOffset + wrap * WINDOW_SIZE < writePos)
                                {
                                    copyFrom += (int)WINDOW_SIZE;
                                }
                            }

                            if (copyFrom > writePos)
                            {
                                copyFrom -= (int)WINDOW_SIZE;
                            }

                            // Copy byte by byte so the run may overlap the bytes it is producing.
                            // Negative source indices stand for zero bytes.
                            for (int copied = 0; copied < runLength; ++copied, ++copyFrom)
                            {
                                output[writePos] = copyFrom < 0 ? (byte)0 : output[copyFrom];
                                ++writePos;

                                if (writePos >= output.Length)
                                {
                                    // Output is full; the rest of the run is dropped.
                                    return new MemoryStream(output);
                                }
                            }
                        }

                        // Stop processing this flag byte once either buffer is exhausted.
                        if (readPos >= inputLength || writePos >= outputLength)
                        {
                            break;
                        }
                    }
                }

                return new MemoryStream(output);
            }
            catch
            {
                // Any failure while decoding is reported to the caller as null.
                return null;
            }
        }
Example #6
0
        /// <summary>
        /// Encodes <paramref name="data"/> into a new <see cref="MemoryStream"/>: the text "AKLZ", eight
        /// fixed header bytes, the byte-swapped input length, and then groups of one flag byte followed by
        /// up to eight entries. A set flag bit marks a literal byte; a clear bit marks a two-byte
        /// back-reference found through an <c>LzWindowDictionary</c> (window 0x1000, maximum match 0xF + 3).
        /// </summary>
        /// <param name="data">Stream whose bytes (via <c>StreamReaderExtensions.ToByteArray</c>) are encoded.</param>
        /// <returns>
        /// The stream holding the encoded data, positioned after the last byte written, or <c>null</c>
        /// when any exception occurs.
        /// </returns>
        public static MemoryStream Compress(Stream data)
        {
            try
            {
                uint inputLength = (uint)data.Length;

                MemoryStream output = new MemoryStream();
                byte[]       input  = StreamReaderExtensions.ToByteArray(data);

                // Match finder configuration.
                LzWindowDictionary window = new LzWindowDictionary();
                window.SetWindowSize(0x1000);
                window.SetMaxMatchAmount(0xF + 3);

                // Header: signature, eight constant bytes (their meaning is not evident from this code),
                // then the byte-swapped input length.
                StreamWriterExtensions.Write(output, "AKLZ");
                byte[] header = new byte[] { 0x7e, 0x3f, 0x51, 0x64, 0x3d, 0xcc, 0xcc, 0xcd };
                StreamWriterExtensions.Write(output, header);
                StreamWriterExtensions.Write(output, EndianUtil.SwapEndian(inputLength));

                // writePos mirrors the output position by hand; encoded data starts at 0x10.
                uint readPos  = 0x0;
                uint writePos = 0x10;

                while (readPos < inputLength)
                {
                    // Reserve the flag byte now and patch it once its entries are known.
                    uint flagPos = writePos;
                    byte flags   = 0x0;
                    output.WriteByte(flags);
                    writePos++;

                    for (int bit = 0; bit < 8; ++bit)
                    {
                        // match[0] is the match address and match[1] its length (0 when nothing was found).
                        // The array is indexed again at every use rather than cached in locals.
                        int[] match = window.Search(input, readPos, inputLength);
                        if (match[1] <= 0)
                        {
                            // No match: emit the byte as a literal and set this entry's flag bit.
                            flags |= (byte)(1 << bit);

                            output.WriteByte(input[readPos]);

                            window.AddEntry(input, (int)readPos);
                            window.SlideWindow(1);

                            readPos++;
                            writePos++;
                        }
                        else
                        {
                            // Match: the flag bit stays clear. Pack the address (biased by 0x12) and the
                            // length (biased by 3) into two bytes: low eight address bits first, then the
                            // upper address bits in the high nibble and the length code in the low nibble.
                            int address    = match[0] - 0x12;
                            int lengthCode = match[1] - 3;
                            output.WriteByte((byte)(address & 0x0FF));
                            output.WriteByte((byte)(lengthCode | ((address & 0xF00) >> 4)));

                            window.AddEntryRange(input, (int)readPos, match[1]);
                            window.SlideWindow(match[1]);

                            readPos  += (uint)match[1];
                            writePos += 2;
                        }

                        // Input exhausted: leave the remaining flag bits clear.
                        if (readPos >= inputLength)
                        {
                            break;
                        }
                    }

                    // Go back to patch the reserved flag byte, then return to the end of the data.
                    output.Seek(flagPos, SeekOrigin.Begin);
                    output.WriteByte(flags);
                    output.Seek(writePos, SeekOrigin.Begin);
                }

                return output;
            }
            catch
            {
                // Any failure while encoding is reported to the caller as null.
                return null;
            }
        }
        /// <summary>
        /// Constructor. Verifies that the WordNet directory and its data/index files exist, re-sorts the
        /// index files once per directory (recorded by a flag file) so that they follow the sort order of
        /// the .NET runtime, and then either loads all synsets into memory or opens on-disk search
        /// streams and data readers.
        /// </summary>
        /// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
        /// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
        /// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
        /// binary search algorithm.</param>
        /// <exception cref="DirectoryNotFoundException">The WordNet directory does not exist.</exception>
        /// <exception cref="FileNotFoundException">One of the data or index files is missing.</exception>
        public WordNetEngine(string wordNetDirectory, bool inMemory)
        {
            _wordNetDirectory         = wordNetDirectory;
            _inMemory                 = inMemory;
            _posIndexWordSearchStream = null;
            _posSynSetDataFile        = null;

            if (!System.IO.Directory.Exists(_wordNetDirectory))
            {
                throw new DirectoryNotFoundException("Non-existent WordNet directory:  " + _wordNetDirectory);
            }

            // get data and index paths
            string[] dataPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "data.adj"),
                Path.Combine(_wordNetDirectory, "data.adv"),
                Path.Combine(_wordNetDirectory, "data.noun"),
                Path.Combine(_wordNetDirectory, "data.verb")
            };

            string[] indexPaths = new string[]
            {
                Path.Combine(_wordNetDirectory, "index.adj"),
                Path.Combine(_wordNetDirectory, "index.adv"),
                Path.Combine(_wordNetDirectory, "index.noun"),
                Path.Combine(_wordNetDirectory, "index.verb")
            };

            // make sure all files exist
            foreach (string path in Enumerable.Union(dataPaths, indexPaths))
            {
                if (!System.IO.File.Exists(path))
                {
                    throw new FileNotFoundException("Failed to find WordNet file:  " + path);
                }
            }

            // Note on stream handling below: every reader and writer that is only needed temporarily is
            // wrapped in a using statement. TryReadLine may already close its reader at end of file (not
            // visible from here), but using guarantees the handle is released on exception paths as well
            // and before the index file is deleted; disposing an already closed reader is harmless.

            #region index file sorting
            string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
            if (!System.IO.File.Exists(sortFlagPath))
            {
                /* make sure the index files are sorted according to the current sort order. the index files in the
                 * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
                 * a different sort order than the .net runtime. thus, unless we resort the lines in the index
                 * files, we won't be able to do a proper binary search over the data. */
                foreach (string indexPath in indexPaths)
                {
                    string line;

                    // get number of words (lines) in file; header lines start with a space
                    int numWords = 0;
                    using (StreamReader indexFile = new StreamReader(indexPath))
                    {
                        while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                        {
                            if (!line.StartsWith(" "))
                            {
                                ++numWords;
                            }
                        }
                    }

                    // create temporary file for sorted lines (only after the counting pass succeeded, so
                    // a failure there does not leave a stray temp file behind)
                    string tempPath = Path.GetTempFileName();
                    using (StreamWriter tempFile = new StreamWriter(tempPath))
                    {
                        // get lines in file, keyed by first column (i.e., the word)
                        Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
                        using (StreamReader indexFile = new StreamReader(indexPath))
                        {
                            while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                            {
                                if (line.StartsWith(" "))
                                {
                                    // write header lines to temp file immediately
                                    tempFile.WriteLine(line);
                                }
                                else
                                {
                                    // trim useless blank spaces from line and map line to first column
                                    line = line.Trim();
                                    wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                                }
                            }
                        }

                        // get sorted words
                        List<string> sortedWords = new List<string>(wordLine.Count);
                        sortedWords.AddRange(wordLine.Keys);
                        sortedWords.Sort();

                        // write lines sorted by word
                        foreach (string word in sortedWords)
                        {
                            tempFile.WriteLine(wordLine[word]);
                        }
                    }

                    // replace original index file with properly sorted one; all handles on both files
                    // have been released at this point
                    System.IO.File.Delete(indexPath);
                    System.IO.File.Move(tempPath, indexPath);
                }

                // create flag file, indicating that we've sorted the data
                using (StreamWriter sortFlagFile = new StreamWriter(sortFlagPath))
                {
                    sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
                }
            }
            #endregion

            #region engine init
            if (inMemory)
            {
                // pass 1:  get total number of synsets
                int totalSynsets = 0;
                foreach (string dataPath in dataPaths)
                {
                    // scan synset data file for lines that don't start with a space...these are synset definition lines
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                ++totalSynsets;
                            }
                        }
                    }
                }

                // pass 2:  create synset shells (pos and offset only)
                _idSynset = new Dictionary<string, SynSet>(totalSynsets);
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // get offset and create synset shell
                                int    offset = int.Parse(line.Substring(0, firstSpace));
                                SynSet synset = new SynSet(pos, offset, null);

                                _idSynset.Add(synset.ID, synset);
                            }
                        }
                    }
                }

                // pass 3:  instantiate synsets (hooks up relations, set glosses, etc.)
                foreach (string dataPath in dataPaths)
                {
                    POS pos = GetFilePOS(dataPath);

                    // scan synset data file
                    using (StreamReader dataFile = new StreamReader(dataPath))
                    {
                        string line;
                        while (StreamReaderExtensions.TryReadLine(dataFile, out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // instantiate synset defined on current line, using the instantiated synsets for all references
                                _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                            }
                        }
                    }
                }

                // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
                _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
                foreach (string indexPath in indexPaths)
                {
                    POS pos = GetFilePOS(indexPath);

                    DictionaryExtensions.EnsureContainsKey(_posWordSynSets, pos, typeof(Dictionary<string, Set<SynSet>>));

                    // scan word index file, skipping header lines
                    using (StreamReader indexFile = new StreamReader(indexPath))
                    {
                        string line;
                        while (StreamReaderExtensions.TryReadLine(indexFile, out line))
                        {
                            int firstSpace = line.IndexOf(' ');
                            if (firstSpace > 0)
                            {
                                // grab word and synset shells, along with the most common synset
                                string      word = line.Substring(0, firstSpace);
                                SynSet      mostCommonSynSet;
                                Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                                // set flag on most common synset if it's ambiguous
                                if (synsets.Count > 1)
                                {
                                    _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);
                                }

                                // use reference to the synsets that we instantiated in our three-pass routine above
                                _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                                foreach (SynSet synset in synsets)
                                {
                                    _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                                }
                            }
                        }
                    }
                }
            }
            else
            {
                // open binary search streams for index files
                _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
                foreach (string indexPath in indexPaths)
                {
                    // create binary search stream for index file
                    BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
                    {
                        // if we landed on the header text, search further down
                        if (currentLine[0] == ' ')
                        {
                            return 1;
                        }

                        // get word on current line
                        string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                        // compare searched-for word to the current word
                        return ((string)searchWord).CompareTo(currentWord);
                    }));

                    // add search stream for current POS
                    _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
                }

                // open readers for synset data files; these are stored in a field and deliberately left
                // open here (presumably they are closed by the engine's own cleanup code — not visible here)
                _posSynSetDataFile = new Dictionary<POS, StreamReader>();
                foreach (string dataPath in dataPaths)
                {
                    _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
                }
            }
            #endregion
        }