/// <summary>
/// Reads a header from <paramref name="stream"/>. Every line is expected to hold a property
/// name and its serialized value, separated by <c>KeyValueSeperator</c>.
/// </summary>
/// <param name="stream">
/// Stream to read from, decoded as UTF-8. The stream is closed when this method returns
/// because the reader wrapping it is disposed.
/// </param>
/// <returns>A new header with every property named in the stream assigned.</returns>
/// <exception cref="InvalidDataException">
/// A line does not contain a separator, or names a property the header type does not have.
/// </exception>
public TrieIndexHeader Deserialize(Stream stream)
{
    var header = new TrieIndexHeader();
    var properties = GetProperties(header);

    using (var reader = new StreamReader(stream, Encoding.UTF8))
    {
        // ReadLine() returning null is the reliable end-of-stream signal; Peek() can
        // return -1 on some streams even though more data will follow.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] keyValue = line.Split(KeyValueSeperator);
            if (keyValue.Length < 2)
            {
                throw new InvalidDataException(
                    string.Format("Header line '{0}' is not a key/value pair", line));
            }

            string key = keyValue[0];
            var property = properties.SingleOrDefault(f => f.Name == key);
            if (property == null)
            {
                throw new InvalidDataException(
                    string.Format("Property not found: '{0}'", key));
            }

            object propertyValue = DeserializeValue(keyValue[1], property.PropertyType);
            property.SetValue(header, propertyValue);
        }
    }

    return header;
}
/// <summary>
/// Looking up a character in a newly constructed header must yield null.
/// </summary>
public void get_character_index_must_be_null()
{
    // Arrange: a header that has had no characters added to it.
    var emptyHeader = new TrieIndexHeader();
    var characterReader = TrieIndexHeaderCharacterReader.Instance;

    // Act
    var foundIndex = characterReader.GetCharacterIndex(emptyHeader, 'a');

    // Assert
    Assert.AreEqual(null, foundIndex);
}
/// <summary>
/// Computes the positions of all children of the node stored at
/// <paramref name="parentPosition"/>.
/// </summary>
/// <param name="binaryReader">Not used by this method.</param>
/// <param name="Header">Header supplying <c>LENGTH_OF_STRUCT</c>, the stride between consecutive children.</param>
/// <param name="parentPosition">Position of the parent node.</param>
/// <returns>
/// One position per flagged child, or <c>null</c> when the parent's children offset is 0.
/// </returns>
internal long[] GetChildrenPositionsFromNode(BinaryReader binaryReader, TrieIndexHeader Header, long parentPosition)
{
    int offsetToChildren = ReadChildrenOffset(parentPosition);

    // An offset of 0 is treated as "this parent has no children".
    if (offsetToChildren == 0)
    {
        return null;
    }

    bool[] flags = ReadChildrenFlags(parentPosition);
    long[] positions = new long[GetFlaggedCount(flags, true)];

    // Children are laid out back to back, LENGTH_OF_STRUCT apart, starting at
    // parentPosition + offsetToChildren.
    int childNo = 0;
    while (childNo < positions.Length)
    {
        positions[childNo] = parentPosition + offsetToChildren + (childNo * Header.LENGTH_OF_STRUCT);
        childNo++;
    }

    return positions;
}
/// <summary>
/// A newly constructed header must expose a non-null, empty character list.
/// </summary>
public void character_list_must_be_initialized_when_new_instance_was_created()
{
    TrieIndexHeader freshHeader = new TrieIndexHeader();

    // The instance itself exists...
    Assert.IsNotNull(freshHeader);

    // ...its character list has been created...
    Assert.IsNotNull(freshHeader.CharacterList);

    // ...and that list starts out empty.
    Assert.AreEqual(0, freshHeader.CharacterList.Count);
}
/// <summary>
/// Sorts the builder's character list in place using <see cref="TrieCharacterComparer"/>.
/// </summary>
/// <param name="header">Header being built; it is only checked for null here.</param>
/// <returns>This builder, so calls can be chained.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="header"/> is null, or the builder's character list is null.
/// </exception>
private TrieIndexHeaderBuilder SortCharacterList(ref TrieIndexHeader header)
{
    if (header == null)
    {
        throw new ArgumentNullException("header");
    }

    if (_characterList == null)
    {
        // Same exception type as before, but the report now names the value that is
        // actually null instead of blaming the header argument.
        throw new ArgumentNullException("_characterList", "The builder's character list is null.");
    }

    _characterList.Sort(new TrieCharacterComparer());
    return this;
}
/// <summary>
/// Creates a builder that keeps the three given streams and starts with an empty header,
/// an empty trie, and empty keyword collections.
/// </summary>
/// <param name="headerStream">Stream kept for the header.</param>
/// <param name="indexStream">Stream kept for the index.</param>
/// <param name="tailStream">Stream kept for the tail.</param>
public IndexBuilder(Stream headerStream, Stream indexStream, Stream tailStream)
{
    // Streams supplied by the caller.
    this._headerStream = headerStream;
    this._indexStream = indexStream;
    this._tailStream = tailStream;

    // Fresh, empty working state.
    this._header = new TrieIndexHeader();
    this._trie = new Trie();
    this._keywords = new HashSet<string>();
    this._keywordDictionary = new Dictionary<string, uint>();
}
/// <summary>
/// Gets the index of the character within the header's character cache.
/// Returns null when the character is not found.
/// </summary>
/// <returns>The character index, or null when <paramref name="c"/> is not in the cache.</returns>
/// <param name="header">Header whose character cache is consulted; the cache is initialized on demand.</param>
/// <param name="c">Character to look up.</param>
internal ushort? GetCharacterIndex(TrieIndexHeader header, char c)
{
    InitCharacterCache(header);

    // One TryGetValue replaces the ContainsKey + indexer pair, so each dictionary
    // is searched only once per call.
    ushort index;
    if (_characterIndexDictionary[header].TryGetValue(c, out index))
    {
        return index;
    }

    return null;
}
/// <summary>
/// Produces a header from the characters collected by this builder: assigns the character
/// list, sorts it, then calculates the size metrics.
/// </summary>
/// <returns>The newly built header.</returns>
internal TrieIndexHeader Build()
{
    var builtHeader = new TrieIndexHeader { CharacterList = _characterList };

    // Sorting happens before the metrics are calculated, as in the original sequence.
    SortCharacterList(ref builtHeader);
    CalculateMetrics(ref builtHeader);

    return builtHeader;
}
/// <summary>
/// Serializes the trie rooted at <paramref name="node"/> into the file at
/// <paramref name="path"/> using <c>TrieIndexSerializer.Serialize</c>.
/// </summary>
/// <param name="instance">Extension receiver; not used by this method.</param>
/// <param name="header">Header passed through to the serializer.</param>
/// <param name="node">Root node to serialize.</param>
/// <param name="path">Path of the file to write.</param>
/// <param name="readBufferSizeInBytes">Buffer size, in bytes, handed to the file stream.</param>
/// <returns>The value returned by <c>TrieIndexSerializer.Serialize</c>.</returns>
public static int CreateIndexFile(this TrieBinaryReader instance, TrieIndexHeader header, TrieNode node, string path, int readBufferSizeInBytes)
{
    // The stream is disposed here so buffered bytes are flushed and the exclusive
    // file handle is released even when serialization throws.
    // NOTE(review): OpenOrCreate does not truncate an existing file, so bytes beyond the
    // newly written data survive a rewrite - confirm whether FileMode.Create is wanted.
    using (Stream stream = new FileStream(
               path,
               FileMode.OpenOrCreate,
               FileAccess.Write,
               FileShare.None,
               readBufferSizeInBytes,
               FileOptions.RandomAccess))
    {
        return TrieIndexSerializer.Serialize(node, header, stream);
    }
}
/// <summary>
/// Writes <paramref name="header"/> to the file at <paramref name="path"/> using
/// <see cref="TrieIndexHeaderSerializer"/>.
/// </summary>
/// <param name="header">Header to serialize.</param>
/// <param name="path">Path of the header file; an existing file is replaced.</param>
public static void CreateHeaderFile(this TrieIndexHeader header, string path)
{
    var serializer = new TrieIndexHeaderSerializer();

    // FileMode.Create truncates an existing file. With OpenOrCreate, writing a header
    // shorter than the previous one left the old trailing lines in place.
    using (Stream stream = new FileStream(
               path,
               FileMode.Create,
               FileAccess.Write,
               FileShare.None))
    {
        serializer.Serialize(stream, header);
    }
}
/// <summary>
/// Collects the positions in <paramref name="bitArray"/> whose bit equals
/// <paramref name="flag"/>.
/// </summary>
/// <param name="bitArray">Bits to scan, from position 0 upwards.</param>
/// <param name="Header">Header whose <c>COUNT_OF_CHILDREN_FLAGS_IN_BYTES</c> is used as the initial list capacity.</param>
/// <param name="flag">Bit value to look for.</param>
/// <returns>The matching positions in ascending order.</returns>
internal ICollection<UInt16> GetFlagedCharCodes(BitArray bitArray, TrieIndexHeader Header, bool flag)
{
    // TODO: use different constant
    var matchingCodes = new List<UInt16>(Header.COUNT_OF_CHILDREN_FLAGS_IN_BYTES);

    UInt16 position = 0;
    while (position < bitArray.Length)
    {
        bool bit = bitArray.Get(position);
        if (bit == flag)
        {
            // TODO: use mapping
            matchingCodes.Add(position);
        }

        position++;
    }

    return matchingCodes;
}
/// <summary>
/// Writes <paramref name="header"/> to the file at <paramref name="path"/> using
/// <c>TrieSerializer.SerializeHeaderWithJsonSerializer</c>.
/// </summary>
/// <param name="header">Header to serialize.</param>
/// <param name="path">Path of the header file; an existing file is replaced.</param>
public static void CreateHeaderFile(this TrieIndexHeader header, string path)
{
    // The using block releases the exclusive file handle even when serialization throws.
    // FileMode.Create truncates an existing file, so a shorter document does not leave
    // old bytes trailing after the new content.
    using (Stream stream = new FileStream(
               path,
               FileMode.Create,
               FileAccess.Write,
               FileShare.None))
    {
        TrieSerializer.SerializeHeaderWithJsonSerializer(stream, header);
    }
}
/// <summary>
/// Returns the characters whose bit in <paramref name="bitArray"/> equals
/// <paramref name="flag"/>, translating each flagged position to a character through
/// the header's character list.
/// </summary>
/// <param name="bitArray">Bits to scan.</param>
/// <param name="Header">Header used to map positions to characters.</param>
/// <param name="flag">Bit value to look for.</param>
/// <returns>The matching characters; empty when nothing matches.</returns>
internal ICollection<char> GetFlagedChars(BitArray bitArray, TrieIndexHeader Header, bool flag)
{
    var characters = new List<char>();

    ICollection<UInt16> flaggedCodes = GetFlagedCharCodes(bitArray, Header, flag);
    if (flaggedCodes == null)
    {
        return characters;
    }

    var characterReader = TrieIndexHeaderCharacterReader.Instance;
    foreach (UInt16 code in flaggedCodes)
    {
        characters.Add(characterReader.GetCharacterAtIndex(Header, code));
    }

    return characters;
}
/// <summary>
/// Walks the trie breadth-first from <paramref name="rootNode"/>, numbering every node in
/// visiting order, registering every node's character with a header builder, and finally
/// storing the built header in <c>_header</c>.
/// </summary>
/// <param name="rootNode">Root of the trie to process.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="rootNode"/>, or a node reachable from it, is null.
/// </exception>
private void ReorderTrieAndLoadHeader(TrieNode rootNode)
{
    var builder = new TrieIndexHeaderBuilder();
    var indexerQueue = new Queue<TrieNode>();
    indexerQueue.Enqueue(rootNode);

    int order = 0;
    while (indexerQueue.Count > 0)
    {
        TrieNode currentNode = indexerQueue.Dequeue();
        if (currentNode == null)
        {
            // The two-argument overload is required: the single-string overload treats
            // its argument as the parameter name, not as the message.
            throw new ArgumentNullException("rootNode", "The root node or one of its descendants is null");
        }

        currentNode.Order = order;
        builder.AddChar(currentNode.Character);

        // When the current node is the first child of its parent (ChildIndex == 0), record
        // on the parent how far ahead in visiting order that first child sits.
        if (currentNode.Parent != null && currentNode.ChildIndex == 0)
        {
            currentNode.Parent.ChildrenCount = currentNode.Order - currentNode.Parent.Order;
        }

        if (currentNode.Children != null)
        {
            int childIndex = 0;
            foreach (var childNode in currentNode.Children)
            {
                childNode.Value.ChildIndex = childIndex++;
                indexerQueue.Enqueue(childNode.Value);
            }
        }

        ++order;
    }

    _header = builder.Build();
}
/// <summary>
/// Searches the index for <c>options.Term</c> and builds a result from the last node
/// reached.
/// </summary>
/// <param name="options">Search options; must not be null.</param>
/// <returns>The result created from the node found for the term.</returns>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
public SearchResult Search(SearchOptions options)
{
    if (options == null)
    {
        // ArgumentNullException derives from ArgumentException, so existing catch blocks
        // keep working, and "options" now lands in ParamName instead of Message.
        throw new ArgumentNullException("options");
    }

    // The header is loaded on first use.
    if (_header == null)
    {
        _header = GetHeader();
    }

    var reader = CreateTrieBinaryReader();
    var node = reader.SearchLastNode(0, options.Term);

    return CreateResultFromNode(reader, node, options.Term, options);
}
/// <summary>
/// Writes every property of <paramref name="header"/> to <paramref name="stream"/> as one
/// line per property: the property name, <c>KeyValueSeperator</c>, then the serialized value.
/// </summary>
/// <param name="stream">
/// Destination stream, written as UTF-8. It is closed when this method returns because the
/// writer wrapping it is disposed.
/// </param>
/// <param name="header">Header whose properties are written.</param>
public void Serialize(Stream stream, TrieIndexHeader header)
{
    IEnumerable<PropertyInfo> headerProperties = GetProperties(header);

    using (var output = new StreamWriter(stream, Encoding.UTF8))
    {
        foreach (PropertyInfo current in headerProperties)
        {
            // "<name><separator>"
            output.Write(current.Name);
            output.Write(KeyValueSeperator);

            // "<value>" followed by the platform line terminator
            var currentValue = current.GetValue(header);
            SerializePropertyValue(currentValue, current.PropertyType, output);
            output.WriteLine();
        }
    }
}
/// <summary>
/// Returns the header for <c>_headerFileName</c>, reading it from the header file on the
/// first request and serving it from <c>_headerDictionary</c> afterwards.
/// </summary>
/// <returns>The cached or newly read header.</returns>
internal override TrieIndexHeader GetHeader()
{
    // Every access to the dictionary happens under the lock. The previous unlocked
    // ContainsKey/indexer reads could run while another thread was inside Add, which a
    // plain dictionary does not support.
    lock (_lockObject)
    {
        TrieIndexHeader header;
        if (!_headerDictionary.TryGetValue(_headerFileName, out header))
        {
            header = TrieNodeHelperFileSystemExtensions.ReadHeaderFile(_headerFileName);
            _headerDictionary.Add(_headerFileName, header);
        }

        return header;
    }
}
/// <summary>
/// Derives the header's size and offset fields from the number of characters collected by
/// this builder. Each field is computed from fields assigned earlier in this method, so the
/// assignment order matters.
/// </summary>
/// <param name="header">Header whose metric fields are filled in.</param>
private void CalculateMetrics(ref TrieIndexHeader header)
{
    header.COUNT_OF_CHARSET = _characterList.Count;

    // Charset size divided by 8, rounded up.
    bool fillsWholeGroupsOf8 = header.COUNT_OF_CHARSET % 8 == 0;
    header.COUNT_OF_CHILDREN_FLAGS =
        header.COUNT_OF_CHARSET / 8 + (fillsWholeGroupsOf8 ? 0 : 1);

    // Charset size divided by 32, rounded up.
    bool fillsWholeGroupsOf32 = header.COUNT_OF_CHARSET % 32 == 0;
    header.COUNT_OF_CHILDREN_FLAGS_IN_BYTES =
        header.COUNT_OF_CHARSET / 32 + (fillsWholeGroupsOf32 ? 0 : 1);

    // Four bytes for each group of 32 flags.
    header.COUNT_OF_CHILDREN_FLAGS_BIT_ARRAY_IN_BYTES =
        header.COUNT_OF_CHILDREN_FLAGS_IN_BYTES * 4;

    // Running totals: each LENGTH_* value adds one more field size to the previous total.
    // The original annotations give the character size as 2 and the terminal size as 1.
    header.LENGTH_OF_CHILDREN_FLAGS =
        header.COUNT_OF_CHARACTER_IN_BYTES + header.COUNT_TERMINAL_SIZE_IN_BYTES;

    header.LENGTH_OF_CHILDREN_OFFSET =
        header.LENGTH_OF_CHILDREN_FLAGS + header.COUNT_OF_CHILDREN_FLAGS_BIT_ARRAY_IN_BYTES;

    header.LENGHT_OF_TEXT_FILE_START_POSITION_IN_BYTES =
        header.LENGTH_OF_CHILDREN_OFFSET + header.COUNT_OF_TEXT_FILE_START_POSITION_IN_BYTES;

    header.LENGTH_OF_STRUCT =
        header.LENGHT_OF_TEXT_FILE_START_POSITION_IN_BYTES + header.COUNT_OF_CHILDREN_OFFSET_IN_BYTES;
}
/// <summary>
/// Builds, once per header, the lookup from character to its position in
/// <c>header.CharacterList</c>. The character '\0' is left out of the lookup.
/// </summary>
/// <param name="header">Header whose character list is indexed.</param>
internal void InitCharacterCache(TrieIndexHeader header)
{
    if (!_isCharacterIndexCacheInitialized.ContainsKey(header))
    {
        lock (this)
        {
            if (!_isCharacterIndexCacheInitialized.ContainsKey(header))
            {
                // Fill a private dictionary first and publish it only when complete.
                var characterIndexes = new Dictionary<char, UInt16>();
                for (UInt16 i = 0; i < header.CharacterList.Count; i++)
                {
                    char character = header.CharacterList[i];
                    if (character != '\0')
                    {
                        characterIndexes.Add(character, i);
                    }
                }

                _characterIndexDictionary.Add(header, characterIndexes);

                // The "initialized" marker is set last. Setting it first let a caller
                // that skipped the lock look up a map that had not been added yet.
                // NOTE(review): the unlocked ContainsKey above can still overlap with
                // these Add calls - confirm whether the fields should be concurrent
                // collections.
                _isCharacterIndexCacheInitialized.Add(header, true);
            }
        }
    }
}
/// <summary>
/// Returns the character stored at <paramref name="index"/> in the header's character list.
/// </summary>
/// <param name="header">Header to read from; its character cache is initialized first.</param>
/// <param name="index">Position within <c>header.CharacterList</c>.</param>
/// <returns>The character at that position.</returns>
internal char GetCharacterAtIndex(TrieIndexHeader header, UInt16 index)
{
    InitCharacterCache(header);

    char character = header.CharacterList[index];
    return character;
}
/// <summary>
/// Creates a trie reader that keeps the given binary reader and header for later use.
/// </summary>
/// <param name="binaryReader">Reader stored in <c>_binaryReader</c>.</param>
/// <param name="header">Header stored in <c>_header</c>.</param>
public TrieBinaryReader(BinaryReader binaryReader, TrieIndexHeader header)
{
    this._binaryReader = binaryReader;
    this._header = header;
}
/// <summary>
/// Lists the runtime properties of the given header's type.
/// </summary>
/// <param name="header">Header instance whose runtime type is inspected.</param>
/// <returns>The properties reported by <c>GetRuntimeProperties</c> for that type.</returns>
private IEnumerable<PropertyInfo> GetProperties(TrieIndexHeader header)
{
    Type headerType = header.GetType();
    IEnumerable<PropertyInfo> runtimeProperties = headerType.GetRuntimeProperties();
    return runtimeProperties;
}
/// <summary>
/// Writes the trie rooted at <paramref name="rootNode"/> to <paramref name="index"/> in
/// breadth-first order. Each node is written as: character index (UInt16), terminal flag,
/// children flags (Int32 words), children offset, and text-file position (UInt32).
/// </summary>
/// <param name="rootNode">Root of the trie to serialize.</param>
/// <param name="trieIndexHeader">Header supplying the character indexes and record sizes.</param>
/// <param name="index">Destination stream. It is flushed but left open.</param>
/// <remarks>Don't forget to dispose stream</remarks>
/// <returns>The number of nodes written.</returns>
/// <exception cref="InvalidDataException">A null node was found in the trie.</exception>
/// <exception cref="InvalidOperationException">
/// A child's character has no index in <paramref name="trieIndexHeader"/>.
/// </exception>
public static int Serialize(TrieNode rootNode, TrieIndexHeader trieIndexHeader, Stream index)
{
    int processedNodeCount = 0;

    Queue<TrieNode> serializerQueue = new Queue<TrieNode>();
    serializerQueue.Enqueue(rootNode);

    // The writer is deliberately not disposed: disposing it would close the caller's stream.
    BinaryWriter binaryWriter = new BinaryWriter(index);

    while (serializerQueue.Count > 0)
    {
        TrieNode currentNode = serializerQueue.Dequeue();
        if (currentNode == null)
        {
            // The format string now has a placeholder, so the node number is reported.
            throw new InvalidDataException(
                string.Format("Value cannot be null (node #{0})", processedNodeCount));
        }

        // write character; a character without an index is written as 0 (its root)
        UInt16? characterIndex = TrieIndexHeaderCharacterReader.Instance.GetCharacterIndex(trieIndexHeader, currentNode.Character);
        binaryWriter.Write(characterIndex.GetValueOrDefault());

        binaryWriter.Write(currentNode.IsTerminal);

        // write children flags: one bit per charset entry, packed into Int32 words
        BitArray baChildren = new BitArray(trieIndexHeader.COUNT_OF_CHARSET);
        if (currentNode.Children != null)
        {
            foreach (var item in currentNode.Children)
            {
                UInt16? itemIndex = TrieIndexHeaderCharacterReader.Instance.GetCharacterIndex(trieIndexHeader, item.Key);
                if (!itemIndex.HasValue)
                {
                    // Same exception type that reading .Value of an empty nullable
                    // raises, but with a message naming the offending character.
                    throw new InvalidOperationException(
                        string.Format("Character '{0}' has no index in the header", item.Key));
                }

                baChildren.Set(itemIndex.Value, true);
            }
        }

        int[] childrenFlags = new int[trieIndexHeader.COUNT_OF_CHILDREN_FLAGS_IN_BYTES];
        BitArrayHelper.CopyToInt32Array(baChildren, childrenFlags, 0);
        for (int i = 0; i < childrenFlags.Length; i++)
        {
            binaryWriter.Write(childrenFlags[i]);
        }

        // write children offset
        binaryWriter.Write(currentNode.ChildrenCount * trieIndexHeader.LENGTH_OF_STRUCT);

        // todo:position of text file
        if (currentNode.PositionOnTextFile.HasValue)
        {
            binaryWriter.Write((uint)currentNode.PositionOnTextFile.Value);
        }
        else
        {
            binaryWriter.Write((uint)0);
        }

        if (currentNode.Children != null)
        {
            foreach (var childNode in currentNode.Children)
            {
                serializerQueue.Enqueue(childNode.Value);
            }
        }

        ++processedNodeCount;
    }

    // Push everything to the underlying stream before handing control back.
    binaryWriter.Flush();

    return processedNodeCount;
}