public void Load(string filePath)
{
    FileName = Path.GetFileName(filePath);
    FilePath = Path.GetFullPath(filePath);

    using (FileStream fs = File.Open(filePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (BufferedStream bs = new BufferedStream(fs))
    using (StreamReader sr = new StreamReader(bs))
    {
        bool insideBody = false;
        while (sr.Peek() >= 0)
        {
            if (!insideBody)
            {
                // Skip ahead until the opening <body> tag; stop at end of stream
                // so a document without a <body> cannot loop forever.
                do
                {
                    string tag = StreamReaderExtensions.ReadUntil(sr, '>');
                    if (tag.ToLower().Contains("<body"))
                    {
                        insideBody = true;
                    }
                } while (!insideBody && sr.Peek() >= 0);

                if (!insideBody)
                {
                    break;
                }
            }

            ParseDataItem dataItem = new ParseDataItem(sr, this, null);
            if (dataItem.HasData)
            {
                AddItem(dataItem);
            }
        }
    }
}
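// Load depends on StreamReaderExtensions.ReadUntil, which is not shown in this
// section. Below is a minimal sketch of the behavior the call sites imply
// (consume characters up to and including the delimiter, returning everything
// read); the real extension may differ in details.
public static string ReadUntil(StreamReader reader, char delimiter)
{
    var sb = new System.Text.StringBuilder();
    while (reader.Peek() >= 0)
    {
        char c = (char)reader.Read();
        sb.Append(c);
        if (c == delimiter)
        {
            break; // the delimiter is included in the returned string
        }
    }
    return sb.ToString();
}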
public ParseDataItem(StreamReader sr, ParseDocument parentDocument, ParseDataItem parentDataItem)
{
    ParentDocument = parentDocument;
    ParentDataItem = parentDataItem;

    string nextTag = string.Empty;
    int associatedChildCount = 0;
    Stack<string> tagStack = new Stack<string>();

    // Extracts the bare tag name from a raw tag string such as "/td>" or "td class=\"x\">".
    string ExtractTagName(string node)
    {
        int start = node.StartsWith("/") ? 1 : 0;
        int end = (node.Contains(" ") || node.StartsWith("br")) ? node.IndexOf(' ') : node.IndexOf('>');
        return node.Substring(start, end - start).Trim();
    }

    do
    {
        while (sr.Peek() >= 0) // Peek() returns -1 at end of stream
        {
            char firstChar;
            if (string.IsNullOrEmpty(nextTag))
            {
                // Read the next significant character, skipping control characters.
                firstChar = (char)sr.Read();
                while (sr.Peek() >= 0 && char.IsControl(firstChar))
                {
                    firstChar = (char)sr.Read();
                }
            }
            else
            {
                // A tag was read ahead during data processing; reuse it.
                firstChar = nextTag[0];
            }

            string toTest = string.Empty;
            string toTestNode = firstChar.Equals('/') ? nextTag : string.Empty;
            if (!string.IsNullOrEmpty(toTestNode))
            {
                toTest = ExtractTagName(toTestNode);
            }

            // Decide whether the current position is a tag (open or close) to handle
            // structurally, rather than element text.
            bool isCloseTag = firstChar.Equals('/');
            if (firstChar.Equals('<')
                || (isCloseTag && tagStack.Peek() != DATA_TAG)
                || (isCloseTag && tagStack.Peek() == DATA_TAG && HasData && !string.IsNullOrEmpty(toTest) && !tagsToProcess.Contains(toTest))
                || (isCloseTag && tagStack.Peek() == DATA_TAG && HasData && reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim()))
                || (isCloseTag && tagStack.Peek() == DATA_TAG && !reservedHeaders.Contains(HTMLDecodedHeader.ToUpper().Trim())))
            {
                string tagNode = string.IsNullOrEmpty(nextTag)
                    ? StreamReaderExtensions.ReadUntil(sr, '>')
                    : nextTag.StartsWith("<") ? nextTag.Substring(1) : nextTag;
                nextTag = string.Empty;

                bool isOpenNode = !(firstChar.Equals('/') || tagNode.StartsWith("/"));
                string tag = ExtractTagName(tagNode);

                if (tagStack.Any() && tagStack.Peek().Trim().ToLower() == DATA_TAG && tag.Trim().ToLower() == TABLE_TAG)
                {
                    // A nested table inside a data cell becomes a child item.
                    if (isOpenNode)
                    {
                        ParseDataItem newChild = new ParseDataItem(sr, ParentDocument, this);
                        if (newChild.HasData)
                        {
                            AddChild(newChild);
                            associatedChildCount++;
                        }
                    }
                }
                else
                {
                    if (tagsToProcess.Contains(tag))
                    {
                        if (isOpenNode)
                        {
                            tagStack.Push(tag);
                        }
                        else
                        {
                            if (tagStack.Peek().Equals(tag))
                            {
                                tagStack.Pop();
                            }
                            if (tag.Equals(DATA_TAG))
                            {
                                // Closing a data cell: record how many child tables it produced.
                                if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                {
                                    ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                }
                                associatedChildCount = 0;
                            }
                        }
                    }
                    if (!tagStack.Any())
                    {
                        break; // the item's root tag has closed; stop parsing
                    }
                }
            }
            else
            {
                // Element text: read up to the next tag and store it as header or value.
                if (tagStack != null && tagStack.Count > 0)
                {
                    bool done = false;
                    int dataLoopCount = 0;
                    do
                    {
                        string control = Int32.TryParse(firstChar.ToString(), out int temp)
                                         || firstChar.ToString().ToUpper().Equals("X")
                                         || firstChar.ToString().ToUpper().Equals("Y")
                            ? string.Empty
                            : firstChar + "<";
                        string text = dataLoopCount == 0
                            ? firstChar + StreamReaderExtensions.ReadUntil(sr, '<')
                            : StreamReaderExtensions.ReadUntil(sr, '<');

                        if (!text.Equals(control) && !text.StartsWith("br /") && !text.Equals("<"))
                        {
                            // Strip the trailing '<' and any control characters.
                            text = new string(text.Substring(0, text.Length - 1).Where(c => !char.IsControl(c)).ToArray());
                            if (!string.IsNullOrEmpty(text))
                            {
                                if (tagStack.Peek().Trim().ToLower() == HEADER_TAG)
                                {
                                    HTMLDecodedHeader = text;
                                }
                                else if (tagStack.Peek().Trim().ToLower() == DATA_TAG)
                                {
                                    if (HTMLDecodedValues != null && HTMLDecodedValues.Any())
                                    {
                                        ValueCounts.Last().AssociatedChildCount = associatedChildCount;
                                    }
                                    associatedChildCount = 0;
                                    AddValue(text);
                                }
                            }
                            nextTag = StreamReaderExtensions.ReadUntil(sr, '>');
                            if (!nextTag.StartsWith("br /") && !nextTag.StartsWith("br/"))
                            {
                                done = true;
                            }
                            dataLoopCount++;
                        }
                        else
                        {
                            // No usable text; read past the tag that follows and requeue it
                            // (with its '<' restored) if it is one we process.
                            nextTag = StreamReaderExtensions.ReadUntil(sr, '>');
                            string tempTag = nextTag.Replace(">", "");
                            if (tagsToProcess.Contains(tempTag))
                            {
                                nextTag = "<" + nextTag;
                            }
                            done = true;
                        }
                    } while (!done);
                }
            }
        }
    } while (tagStack.Any() && sr.Peek() >= 0);
}
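// A usage sketch for the parser above. ParseDocument's parameterless
// constructor and an Items collection are assumptions; only Load and the
// HasData/HTMLDecodedHeader members actually appear in this section.
static void ExampleParseUsage()
{
    var doc = new ParseDocument();        // assumed constructor
    doc.Load(@"C:\temp\report.html");     // placeholder path

    foreach (ParseDataItem item in doc.Items) // 'Items' is an assumed property name
    {
        Console.WriteLine(item.HTMLDecodedHeader);
    }
}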
public void StreamReader_ReadTo_ReaderIsNull_ShouldThrowException()
{
    // A null reader should throw; the test framework's expected-exception
    // attribute (not shown in this excerpt) is assumed to decorate the method.
    var result = StreamReaderExtensions.ReadTo(null, out var needle, "Test");
}

public void StreamReader_Find_ReaderIsNull_ShouldThrowException()
{
    // Same null-reader guard, exercised through Find.
    StreamReaderExtensions.Find(null, "Test");
}
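// Both tests exercise a null-reader guard in StreamReaderExtensions. Below is a
// minimal sketch of the guard those tests presumably expect; the concrete
// exception type and the elided search logic are assumptions, and Find would
// carry the same guard.
public static string ReadTo(StreamReader reader, out string needle, string target)
{
    if (reader == null)
    {
        throw new ArgumentNullException(nameof(reader)); // assumed exception type
    }

    needle = null; // search logic elided; only the guard is sketched here
    return null;
}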
public static MemoryStream Decompress(Stream data)
{
    try
    {
        data.Position = 0;

        // Check for the "AKLZ" magic bytes; pass non-AKLZ data through unchanged.
        byte[] aklzBuffer = new byte[4];
        data.Read(aklzBuffer, 0, 4);
        if (aklzBuffer[0] != 0x41 || aklzBuffer[1] != 0x4B || aklzBuffer[2] != 0x4C || aklzBuffer[3] != 0x5A)
        {
            return new MemoryStream(StreamReaderExtensions.ToByteArray(data));
        }

        const uint START_INDEX = 0x1000;

        // Compressed & decompressed data information
        uint compressedSize = (uint)data.Length;
        uint decompressedSize = EndianUtil.SwapEndian(StreamReaderExtensions.ReadUInt(data, 0xC));
        uint sourcePointer = 0x10;
        uint destPointer = 0x0;
        byte[] compressedData = StreamReaderExtensions.ToByteArray(data);
        byte[] decompressedData = new byte[decompressedSize];

        // Start decompression
        while (sourcePointer < compressedSize && destPointer < decompressedSize)
        {
            byte instruction = compressedData[sourcePointer]; // compression flag: one bit per following token
            sourcePointer++;

            for (int i = 0; i < 8; ++i)
            {
                bool copySingleByte = (instruction & 0x01) != 0;
                instruction >>= 1;

                if (copySingleByte) // data is not compressed
                {
                    decompressedData[destPointer] = compressedData[sourcePointer];
                    sourcePointer++;
                    destPointer++;
                }
                else // data is compressed: 12-bit address plus 4-bit length
                {
                    int copyFromAddress = (compressedData[sourcePointer] | ((compressedData[sourcePointer + 1] & 0xF0) << 4)) + 0x12;
                    int amount = (compressedData[sourcePointer + 1] & 0x0F) + 3;
                    sourcePointer += 2;

                    // Addresses are relative to a 0x1000-byte sliding window; adjust for wrap-around.
                    int memCopyAddress = copyFromAddress;
                    uint wrapCount = destPointer / START_INDEX;
                    for (int wrap = 1; wrap <= wrapCount; ++wrap)
                    {
                        if (copyFromAddress + wrap * START_INDEX < destPointer)
                        {
                            memCopyAddress += (int)START_INDEX;
                        }
                    }
                    if (memCopyAddress > destPointer)
                    {
                        memCopyAddress -= (int)START_INDEX;
                    }

                    // Copy 'amount' bytes from previously decompressed data.
                    for (int copyIndex = 0; copyIndex < amount; ++copyIndex, ++memCopyAddress)
                    {
                        if (memCopyAddress < 0)
                        {
                            // Reads before the start of the window decode as zero.
                            decompressedData[destPointer] = 0;
                        }
                        else
                        {
                            decompressedData[destPointer] = decompressedData[memCopyAddress];
                        }
                        ++destPointer;
                        if (destPointer >= decompressedData.Length)
                        {
                            return new MemoryStream(decompressedData);
                        }
                    }
                }

                // Check for out of range
                if (sourcePointer >= compressedSize || destPointer >= decompressedSize)
                {
                    break;
                }
            }
        }

        return new MemoryStream(decompressedData);
    }
    catch
    {
        return null; // an error occurred while decompressing
    }
}
public static MemoryStream Compress(Stream data)
{
    try
    {
        uint decompressedSize = (uint)data.Length;
        MemoryStream compressedData = new MemoryStream();
        byte[] decompressedData = StreamReaderExtensions.ToByteArray(data);

        uint sourcePointer = 0x0;
        uint destPointer = 0x10;

        // Set up the LZ compression dictionary: a 0x1000-byte window with
        // matches of at most 0xF + 3 bytes, mirroring the decompressor's limits.
        LzWindowDictionary lzDictionary = new LzWindowDictionary();
        lzDictionary.SetWindowSize(0x1000);
        lzDictionary.SetMaxMatchAmount(0xF + 3);

        // Write the header: "AKLZ" magic, fixed header bytes, and the
        // big-endian decompressed size.
        StreamWriterExtensions.Write(compressedData, "AKLZ");
        byte[] header = new byte[] { 0x7e, 0x3f, 0x51, 0x64, 0x3d, 0xcc, 0xcc, 0xcd };
        StreamWriterExtensions.Write(compressedData, header);
        StreamWriterExtensions.Write(compressedData, EndianUtil.SwapEndian(decompressedSize));

        while (sourcePointer < decompressedSize)
        {
            byte flag = 0x0;
            uint flagPosition = destPointer;
            compressedData.WriteByte(flag); // it will be filled in later
            destPointer++;

            for (int i = 0; i < 8; ++i)
            {
                int[] lzSearchMatch = lzDictionary.Search(decompressedData, sourcePointer, decompressedSize);
                if (lzSearchMatch[1] > 0) // there is a compression match
                {
                    // Flag bit i stays 0 to mark a dictionary match.
                    int copySize = lzSearchMatch[1] - 3;
                    int address = lzSearchMatch[0] - 0x12;
                    byte firstByte = (byte)(address & 0x0FF);
                    byte secondByte = (byte)(copySize | ((address & 0xF00) >> 4));
                    compressedData.WriteByte(firstByte);
                    compressedData.WriteByte(secondByte);

                    lzDictionary.AddEntryRange(decompressedData, (int)sourcePointer, lzSearchMatch[1]);
                    lzDictionary.SlideWindow(lzSearchMatch[1]);
                    sourcePointer += (uint)lzSearchMatch[1];
                    destPointer += 2;
                }
                else // there wasn't a match
                {
                    flag |= (byte)(1 << i);
                    compressedData.WriteByte(decompressedData[sourcePointer]);
                    lzDictionary.AddEntry(decompressedData, (int)sourcePointer);
                    lzDictionary.SlideWindow(1);
                    sourcePointer++;
                    destPointer++;
                }

                // Check for out of bounds
                if (sourcePointer >= decompressedSize)
                {
                    break;
                }
            }

            // Write the flag byte back at its reserved position, then return
            // to the end of the stream.
            compressedData.Seek(flagPosition, SeekOrigin.Begin);
            compressedData.WriteByte(flag);
            compressedData.Seek(destPointer, SeekOrigin.Begin);
        }

        return compressedData;
    }
    catch
    {
        return null; // an error occurred while compressing
    }
}
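// A round-trip sketch for the two methods above: compress a buffer, then
// decompress it and compare with the original. The input path is a placeholder.
static void ExampleRoundTrip()
{
    byte[] original = File.ReadAllBytes(@"C:\temp\input.bin"); // placeholder path

    using (var source = new MemoryStream(original))
    using (MemoryStream packed = Compress(source))
    using (MemoryStream unpacked = Decompress(packed))
    {
        // Compress/Decompress return null on failure, so guard before comparing.
        bool ok = packed != null && unpacked != null && original.SequenceEqual(unpacked.ToArray());
        Console.WriteLine(ok ? "Round trip OK" : "Round trip failed");
    }
}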
/// <summary>
/// Constructor
/// </summary>
/// <param name="wordNetDirectory">Path to WordNet directory (the one with the data and index files in it)</param>
/// <param name="inMemory">Whether or not to store all data in memory. In-memory storage requires quite a bit of space
/// but it is also very quick. The alternative (false) will cause the data to be searched on-disk with an efficient
/// binary search algorithm.</param>
public WordNetEngine(string wordNetDirectory, bool inMemory)
{
    _wordNetDirectory = wordNetDirectory;
    _inMemory = inMemory;
    _posIndexWordSearchStream = null;
    _posSynSetDataFile = null;

    if (!System.IO.Directory.Exists(_wordNetDirectory))
    {
        throw new DirectoryNotFoundException("Non-existent WordNet directory: " + _wordNetDirectory);
    }

    // get data and index paths
    string[] dataPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "data.adj"),
        Path.Combine(_wordNetDirectory, "data.adv"),
        Path.Combine(_wordNetDirectory, "data.noun"),
        Path.Combine(_wordNetDirectory, "data.verb")
    };

    string[] indexPaths = new string[]
    {
        Path.Combine(_wordNetDirectory, "index.adj"),
        Path.Combine(_wordNetDirectory, "index.adv"),
        Path.Combine(_wordNetDirectory, "index.noun"),
        Path.Combine(_wordNetDirectory, "index.verb")
    };

    // make sure all files exist
    foreach (string path in Enumerable.Union(dataPaths, indexPaths)) //ZK change to static method
    {
        if (!System.IO.File.Exists(path))
        {
            throw new FileNotFoundException("Failed to find WordNet file: " + path);
        }
    }

    #region index file sorting
    string sortFlagPath = Path.Combine(_wordNetDirectory, ".sorted_for_dot_net");
    if (!System.IO.File.Exists(sortFlagPath))
    {
        /* make sure the index files are sorted according to the current sort order. the index files in the
         * wordnet distribution are sorted in the order needed for (presumably) the java api, which uses
         * a different sort order than the .net runtime. thus, unless we resort the lines in the index
         * files, we won't be able to do a proper binary search over the data.
         */
        foreach (string indexPath in indexPaths)
        {
            // create temporary file for sorted lines
            string tempPath = Path.GetTempFileName();
            StreamWriter tempFile = new StreamWriter(tempPath);

            // get number of words (lines) in file
            int numWords = 0;
            StreamReader indexFile = new StreamReader(indexPath);
            string line;
            while (StreamReaderExtensions.TryReadLine(indexFile, out line)) //ZK change to static method
            {
                if (!line.StartsWith(" "))
                {
                    ++numWords;
                }
            }
            indexFile.Close();

            // get lines in file, sorted by first column (i.e., the word)
            Dictionary<string, string> wordLine = new Dictionary<string, string>(numWords);
            indexFile = new StreamReader(indexPath);
            while (StreamReaderExtensions.TryReadLine(indexFile, out line)) //ZK change to static method
            {
                // write header lines to temp file immediately
                if (line.StartsWith(" "))
                {
                    tempFile.WriteLine(line);
                }
                else
                {
                    // trim useless blank spaces from line and map line to first column
                    line = line.Trim();
                    wordLine.Add(line.Substring(0, line.IndexOf(' ')), line);
                }
            }
            indexFile.Close(); // release the file so it can be replaced below

            // get sorted words
            List<string> sortedWords = new List<string>(wordLine.Count);
            sortedWords.AddRange(wordLine.Keys);
            sortedWords.Sort();

            // write lines sorted by word
            foreach (string word in sortedWords)
            {
                tempFile.WriteLine(wordLine[word]);
            }
            tempFile.Close();

            // replace original index file with properly sorted one
            System.IO.File.Delete(indexPath);
            System.IO.File.Move(tempPath, indexPath);
        }

        // create flag file, indicating that we've sorted the data
        StreamWriter sortFlagFile = new StreamWriter(sortFlagPath);
        sortFlagFile.WriteLine("This file serves no purpose other than to indicate that the WordNet distribution data in the current directory has been sorted for use by the .NET API.");
        sortFlagFile.Close();
    }
    #endregion

    #region engine init
    if (inMemory)
    {
        // pass 1: get total number of synsets
        int totalSynsets = 0;
        foreach (string dataPath in dataPaths)
        {
            // scan synset data file for lines that don't start with a space...these are synset definition lines
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (StreamReaderExtensions.TryReadLine(dataFile, out line)) //ZK change to static method
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    ++totalSynsets;
                }
            }
            dataFile.Close();
        }

        // pass 2: create synset shells (pos and offset only)
        _idSynset = new Dictionary<string, SynSet>(totalSynsets);
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (StreamReaderExtensions.TryReadLine(dataFile, out line)) //ZK change to static method
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // get offset and create synset shell
                    int offset = int.Parse(line.Substring(0, firstSpace));
                    SynSet synset = new SynSet(pos, offset, null);
                    _idSynset.Add(synset.ID, synset);
                }
            }
            dataFile.Close();
        }

        // pass 3: instantiate synsets (hooks up relations, sets glosses, etc.)
        foreach (string dataPath in dataPaths)
        {
            POS pos = GetFilePOS(dataPath);

            // scan synset data file
            StreamReader dataFile = new StreamReader(dataPath);
            string line;
            while (StreamReaderExtensions.TryReadLine(dataFile, out line)) //ZK change to static method
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // instantiate synset defined on current line, using the instantiated synsets for all references
                    _idSynset[pos + ":" + int.Parse(line.Substring(0, firstSpace))].Instantiate(line, _idSynset);
                }
            }
            dataFile.Close();
        }

        // organize synsets by pos and words...also set most common synset for word-pos pairs that have multiple synsets
        _posWordSynSets = new Dictionary<POS, Dictionary<string, Set<SynSet>>>();
        foreach (string indexPath in indexPaths)
        {
            POS pos = GetFilePOS(indexPath);
            DictionaryExtensions.EnsureContainsKey(_posWordSynSets, pos, typeof(Dictionary<string, Set<SynSet>>)); //ZK change to static method

            // scan word index file, skipping header lines
            StreamReader indexFile = new StreamReader(indexPath);
            string line;
            while (StreamReaderExtensions.TryReadLine(indexFile, out line)) //ZK change to static method
            {
                int firstSpace = line.IndexOf(' ');
                if (firstSpace > 0)
                {
                    // grab word and synset shells, along with the most common synset
                    string word = line.Substring(0, firstSpace);
                    SynSet mostCommonSynSet;
                    Set<SynSet> synsets = GetSynSetShells(line, pos, out mostCommonSynSet, null);

                    // set flag on most common synset if it's ambiguous
                    if (synsets.Count > 1)
                    {
                        _idSynset[mostCommonSynSet.ID].SetAsMostCommonSynsetFor(word);
                    }

                    // use reference to the synsets that we instantiated in our three-pass routine above
                    _posWordSynSets[pos].Add(word, new Set<SynSet>(synsets.Count));
                    foreach (SynSet synset in synsets)
                    {
                        _posWordSynSets[pos][word].Add(_idSynset[synset.ID]);
                    }
                }
            }
            indexFile.Close();
        }
    }
    else
    {
        // open binary search streams for index files
        _posIndexWordSearchStream = new Dictionary<POS, BinarySearchTextStream>();
        foreach (string indexPath in indexPaths)
        {
            // create binary search stream for index file
            BinarySearchTextStream searchStream = new BinarySearchTextStream(indexPath, new BinarySearchTextStream.SearchComparisonDelegate(delegate(object searchWord, string currentLine)
            {
                // if we landed on the header text, search further down
                if (currentLine[0] == ' ')
                {
                    return 1;
                }

                // get word on current line
                string currentWord = currentLine.Substring(0, currentLine.IndexOf(' '));

                // compare searched-for word to the current word
                return ((string)searchWord).CompareTo(currentWord);
            }));

            // add search stream for current POS
            _posIndexWordSearchStream.Add(GetFilePOS(indexPath), searchStream);
        }

        // open readers for synset data files
        _posSynSetDataFile = new Dictionary<POS, StreamReader>();
        foreach (string dataPath in dataPaths)
        {
            _posSynSetDataFile.Add(GetFilePOS(dataPath), new StreamReader(dataPath));
        }
    }
    #endregion
}
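// A usage sketch for the constructor above. The dictionary path is a
// placeholder; it should point at the WordNet directory containing the
// data.* and index.* files.
static void ExampleEngineInit()
{
    // In-memory mode: slower startup (three passes over the data files), fast lookups.
    var memoryEngine = new WordNetEngine(@"C:\wordnet\dict", true);

    // On-disk mode: near-instant startup; lookups binary-search the index files.
    var diskEngine = new WordNetEngine(@"C:\wordnet\dict", false);
}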