/// <summary>
/// Removes the child entry registered under the character of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose <c>Char</c> selects the child entry to remove.</param>
/// <returns>
/// <c>true</c> if an entry for that character existed and was removed; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// The entry is selected by character only, so it is removed even when the stored child
/// is a different instance than <paramref name="node"/>.
/// </remarks>
public bool RemoveChild(DawgNode node)
{
    // Remove already reports whether the key was present, so probing with
    // ContainsKey first only costs a second lookup.
    return _childs.Remove(node.Char);
}
/// <summary>
/// Looks up the child node registered under the given character.
/// </summary>
/// <param name="text">Character to look up among this node's children.</param>
/// <returns>The matching child node, or <c>null</c> when no child uses that character.</returns>
public DawgNode Next(char text)
{
    DawgNode child;
    return _childs.TryGetValue(text, out child) ? child : null;
}
/// <summary>
/// Computes an integer identifier for <paramref name="node"/> from the nodes
/// returned by <c>DescendantsAndSelf()</c>.
/// </summary>
/// <param name="node">Node whose sub-structure is summarised.</param>
/// <returns>
/// The result of <c>FnvHash.GetHash</c> over the signature text, cast to <c>int</c>.
/// </returns>
private static int GetDawgNodeId(DawgNode node)
{
    // Signature: for each visited node, its character followed by '1' (end of word) or '0'.
    var signature = new StringBuilder();
    foreach (var current in node.DescendantsAndSelf())
    {
        signature.Append(current.Char);
        if (current.Eow)
        {
            signature.Append('1');
        }
        else
        {
            signature.Append('0');
        }
    }

    var hash = FnvHash.GetHash(signature.ToString());
    return (int)hash;
}
/// <summary>
/// Registers <paramref name="node"/> as a child under its character.
/// </summary>
/// <param name="node">Node to add; its <c>Char</c> is used as the key.</param>
/// <returns>
/// <c>false</c> if a child with the same character is already registered (nothing is changed);
/// otherwise <c>true</c>.
/// </returns>
public bool AddChild(DawgNode node)
{
    if (_childs.ContainsKey(node.Char))
    {
        return false;
    }

    // Only the first parent is recorded; a node that already has one keeps it.
    if (node.Parent == null)
    {
        node.Parent = this;
    }

    _childs.Add(node.Char, node);
    return true;
}
/// <summary>
/// Reads a serialized dawg from <paramref name="stream"/> and rebuilds the node graph.
/// </summary>
/// <param name="stream">Stream positioned at the start of the dawg data. It is wrapped in a
/// <see cref="BinaryReader"/> that is disposed when decoding ends.</param>
/// <returns>A <c>Dawg</c> whose root holds the decoded top-level nodes.</returns>
/// <exception cref="ApplicationException">The version stored in the stream differs from <c>_version</c>.</exception>
public Dawg Decode(Stream stream)
{
    using (var input = new BinaryReader(stream))
    {
        // The version is stored as a single-precision float at the start of the data.
        var fileVersion = input.ReadSingle();
        if (fileVersion != _version)
        {
            var message = string.Format("The file version of dawg is not match.\rThe decoder version is {0},but file version is {1}", _version, fileVersion);
            throw new ApplicationException(message);
        }

        // Node table: one record per node (char, frequency, depth, end-of-word flag, size).
        var table = new DawgNode[input.ReadInt32()];
        for (var index = 0; index < table.Length; index++)
        {
            var ch = (char)input.ReadInt16();
            var frequency = input.ReadInt32();
            var depth = input.ReadInt32();
            var isEndOfWord = input.ReadBoolean();
            var size = input.ReadInt32();

            table[index] = new DawgNode(ch, size)
            {
                Eow = isEndOfWord,
                Depth = depth,
                Frequency = frequency
            };
        }

        // Root section: a count followed by that many indexes into the node table.
        var rootChildCount = input.ReadInt32();
        var root = new DawgNode('\0', rootChildCount);
        for (var index = 0; index < rootChildCount; index++)
        {
            root.AddChild(table[input.ReadInt32()]);
        }

        // Edge section: for each parent index, a child count and the child indexes.
        var parentCount = input.ReadInt32();
        for (var p = 0; p < parentCount; p++)
        {
            var parent = table[input.ReadInt32()];
            var childCount = input.ReadInt32();
            for (var c = 0; c < childCount; c++)
            {
                parent.AddChild(table[input.ReadInt32()]);
            }
        }

        return new Dawg(root);
    }
}
/// <summary>
/// Reads a serialized dawg from <paramref name="stream"/> and rebuilds the node graph.
/// </summary>
/// <param name="stream">Stream positioned at the start of the dawg data. It is wrapped in a
/// <see cref="BinaryReader"/> that is disposed when decoding ends.</param>
/// <returns>A <c>Dawg</c> whose root holds the decoded top-level nodes.</returns>
/// <exception cref="ApplicationException">The version stored in the stream differs from <c>_version</c>.</exception>
public Dawg Decode(Stream stream)
{
    using (var binary = new BinaryReader(stream))
    {
        // The version is stored as a double at the start of the data.
        var storedVersion = binary.ReadDouble();
        if (storedVersion != _version)
        {
            throw new ApplicationException(
                string.Format(
                    "The file version of dawg is not match.\rThe decoder version is {0},but file version is {1}",
                    _version,
                    storedVersion));
        }

        // Node table: char, frequency, depth, end-of-word flag and size for every node.
        var nodes = new DawgNode[binary.ReadInt32()];
        var slot = 0;
        while (slot < nodes.Length)
        {
            var symbol = (char)binary.ReadInt16();
            var frequency = binary.ReadInt32();
            var depth = binary.ReadInt32();
            var endOfWord = binary.ReadBoolean();
            var capacity = binary.ReadInt32();

            var decoded = new DawgNode(symbol, capacity);
            decoded.Eow = endOfWord;
            decoded.Depth = depth;
            decoded.Frequency = frequency;
            nodes[slot] = decoded;
            slot++;
        }

        // Children of the root, given as indexes into the node table.
        var topLevelCount = binary.ReadInt32();
        var root = new DawgNode(default(char), topLevelCount);
        var remaining = topLevelCount;
        while (remaining > 0)
        {
            var topLevelNode = nodes[binary.ReadInt32()];
            root.AddChild(topLevelNode);
            remaining--;
        }

        // Remaining edges: parent index, number of children, then each child index.
        var linkedParents = binary.ReadInt32();
        for (var parentNo = 0; parentNo < linkedParents; parentNo++)
        {
            var parentIndex = binary.ReadInt32();
            var parent = nodes[parentIndex];
            var children = binary.ReadInt32();
            for (var childNo = 0; childNo < children; childNo++)
            {
                var child = nodes[binary.ReadInt32()];
                parent.AddChild(child);
            }
        }

        return new Dawg(root);
    }
}
/// <summary>
/// Lazily enumerates every word reachable below <paramref name="node"/>, depth first.
/// </summary>
/// <param name="commonPrefix">Text accumulated on the path leading to <paramref name="node"/>.</param>
/// <param name="node">Node whose children are walked; <c>null</c> yields nothing.</param>
/// <returns>
/// Pairs of word text and the <c>Frequency</c> of the node that ends the word.
/// </returns>
private static IEnumerable<KeyValuePair<string, int>> IterateNodesString(string commonPrefix, DawgNode node)
{
    if (node != null)
    {
        foreach (var child in node.ChildNodes)
        {
            string word = commonPrefix + child.Char;

            // A node flagged as end-of-word is reported before any longer word below it.
            if (child.Eow)
            {
                yield return new KeyValuePair<string, int>(word, child.Frequency);
            }

            foreach (var longerWord in IterateNodesString(word, child))
            {
                yield return longerWord;
            }
        }
    }
}
/// <summary>
/// Produces the next token starting at the reader's current position.
/// </summary>
/// <remarks>
/// Candidate segmentations ("chunks") of one, two or three consecutive matches are collected
/// from <c>MatchedNodes</c>; when more than one candidate exists, <c>Filters</c> are applied in
/// order until one remains. Only the first word of the chosen chunk is consumed and returned.
/// NOTE(review): <c>MatchedNodes</c> and <c>Filters</c> are defined outside this block; presumably
/// MatchedNodes returns the dictionary words that start at the given offset - confirm.
/// </remarks>
/// <returns>
/// <c>null</c> when the peeked character reports <c>IsNull()</c>; the result of <c>base.Next()</c>
/// when the dawg offers no usable match at this position; otherwise a <c>TokenType.CJK</c> token.
/// </returns>
public override Token Next()
{
    var baseOffset = _reader.Position;
    var code = _reader.Peek();
    if (code.IsNull())
    {
        return(null);
    }
    DawgNode node = _dawg.Root.Next(code);
    // No root entry for this character, or the entry has no children:
    // rewind and let the base tokenizer handle it.
    if (node == null || !node.HasChildNodes)
    {
        _reader.Seek(baseOffset);
        return(base.Next());
    }
    var firstOfNodes = this.MatchedNodes(baseOffset);
    if (firstOfNodes.Count == 0)
    {
        _reader.Seek(baseOffset);
        return(base.Next());
    }
    var maxLength = 0;
    var chunks = new List <Chunk>(3);
    // Each candidate list is walked from its last element to its first.
    for (var i = firstOfNodes.Count - 1; i >= 0; i--)
    {
        // A match's length is taken as Depth + 1, so offset1 is where the second word would start.
        var offset1 = baseOffset + firstOfNodes[i].Depth + 1;
        var secondOfNodes = this.MatchedNodes(offset1);
        if (secondOfNodes.Count > 0)
        {
            for (var j = secondOfNodes.Count - 1; j >= 0; j--)
            {
                var offset2 = offset1 + secondOfNodes[j].Depth + 1;
                var thirdOfNodes = this.MatchedNodes(offset2);
                if (thirdOfNodes.Count > 0)
                {
                    for (var k = thirdOfNodes.Count - 1; k >= 0; k--)
                    {
                        var offset3 = offset2 + thirdOfNodes[k].Depth + 1;
                        var length = offset3 - baseOffset;
                        // Rule 1: maximum matching. Keep a three-word chunk only if it is at least
                        // as long as the longest chunk seen so far. Shorter chunks added earlier
                        // stay in the list.
                        if (length >= maxLength)
                        {
                            maxLength = length;
                            var chunk = new Chunk(length, new WordPoint[] { new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency), new WordPoint(offset1, offset2 - offset1, secondOfNodes[j].Frequency), new WordPoint(offset2, offset3 - offset2, thirdOfNodes[k].Frequency) });
                            chunks.Add(chunk);
                        }
                    }
                }
                else
                {
                    // No third word follows: consider the two-word chunk.
                    var length = offset2 - baseOffset;
                    // Rule 1: maximum matching.
                    if (length >= maxLength)
                    {
                        maxLength = length;
                        var chunk = new Chunk(length, new WordPoint[] { new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency), new WordPoint(offset1, offset2 - offset1, secondOfNodes[j].Frequency) });
                        chunks.Add(chunk);
                    }
                }
            }
        }
        else
        {
            // No second word follows: consider the single-word chunk.
            var length = offset1 - baseOffset;
            // Rule 1: maximum matching.
            if (length >= maxLength)
            {
                maxLength = length;
                var chunk = new Chunk(length, new WordPoint[] { new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency) });
                chunks.Add(chunk);
            }
        }
    }
    if (chunks.Count > 1)
    {
        var count = chunks.Count;
        // Apply each filter in turn, feeding it the count returned by the previous one,
        // and stop as soon as a filter reports a single remaining chunk.
        foreach (var filter in Filters)
        {
            if ((count = filter.Apply(chunks, count)) == 1)
            {
                break;
            }
        }
    }
    // Seek to the first word of the chunk at index 0 and read exactly that word;
    // the reader is then positioned for the next call.
    var bestChunk = chunks[0];
    _reader.Seek(bestChunk[0].Offset);
    return(new Token(new string(_reader.Read(bestChunk[0].Length)), TokenType.CJK));
}
/// <summary>
/// Adds <paramref name="node"/> to this node's children, keyed by its character.
/// </summary>
/// <param name="node">Node to register as a child.</param>
/// <returns>
/// <c>true</c> when the node was added; <c>false</c> when its character is already taken,
/// in which case neither the children nor the node's parent are touched.
/// </returns>
public bool AddChild(DawgNode node)
{
    bool canAdd = !_childs.ContainsKey(node.Char);
    if (canAdd)
    {
        // A parent that was set earlier is kept; only parentless nodes are adopted.
        if (node.Parent == null)
        {
            node.Parent = this;
        }

        _childs.Add(node.Char, node);
    }

    return canAdd;
}
/// <summary>
/// Removes the child entry registered under the character of <paramref name="node"/>.
/// </summary>
/// <param name="node">Node whose <c>Char</c> selects the entry to remove.</param>
/// <returns><c>true</c> if such an entry existed; otherwise <c>false</c>.</returns>
public bool RemoveChild(DawgNode node)
{
    bool present = _childs.ContainsKey(node.Char);
    if (present)
    {
        // The entry is matched by character only, not by node instance.
        _childs.Remove(node.Char);
    }

    return present;
}
/// <summary>
/// Derives an integer identifier for <paramref name="node"/> by hashing a textual
/// signature of the nodes returned by <c>DescendantsAndSelf()</c>.
/// </summary>
/// <param name="node">Node to identify.</param>
/// <returns>The <c>FnvHash.GetHash</c> value of the signature, cast to <c>int</c>.</returns>
private static int GetDawgNodeId(DawgNode node)
{
    var buffer = new StringBuilder();
    foreach (var member in node.DescendantsAndSelf())
    {
        // Each visited node contributes its character and an end-of-word marker.
        buffer.Append(member.Char).Append(member.Eow ? '1' : '0');
    }

    string key = buffer.ToString();
    return (int)FnvHash.GetHash(key);
}
/// <summary>
/// Builds a <c>Dawg</c> from a bag of words: first inserts every word into a trie,
/// then merges nodes that share the same identifier, level by level from the deepest level up.
/// </summary>
/// <param name="wordBag">Pairs of word text and the frequency stored on the word's final node.</param>
/// <returns>A <c>Dawg</c> wrapping the root of the resulting graph.</returns>
public Dawg Build(IEnumerable<KeyValuePair<string, int>> wordBag)
{
    var root = new DawgNode();
    var levelNodeCollections = new Dictionary<int, List<DawgNode>>();

    // Phase 1: insert each word character by character, recording every visited node
    // under its level. A node shared by several words is recorded once per visit.
    foreach (var pair in wordBag)
    {
        var word = pair.Key;
        var nextNode = root;
        var level = 0;
        for (var i = 0; i < word.Length; i++, level++)
        {
            var ch = word[i];
            var currNode = nextNode.Next(ch);
            if (currNode == null)
            {
                currNode = new DawgNode(ch) { Depth = level };
                nextNode.AddChild(currNode);
            }

            // Collect nodes by level.
            List<DawgNode> nodes = null;
            if (!levelNodeCollections.TryGetValue(level, out nodes))
            {
                nodes = new List<DawgNode>();
                levelNodeCollections[level] = nodes;
            }

            nodes.Add(currNode);
            nextNode = currNode;
        }

        // The last node reached marks the end of the word and carries its frequency.
        nextNode.Eow = true;
        nextNode.Frequency = pair.Value;
    }

    // Phase 2: merge equivalent nodes, deepest level first. To avoid visiting every node,
    // only end-of-word nodes and parents touched by an earlier merge are examined.
    var trackingNodes = new HashSet<DawgNode>();
    for (var j = levelNodeCollections.Count - 1; j >= 0; j--)
    {
        // NOTE(review): nodes are treated as equal when GetDawgNodeId returns the same int;
        // two different sub-structures that hash alike would be merged - confirm this is acceptable.
        var uniqNodeTables = new Dictionary<int, DawgNode>();
        foreach (var node in levelNodeCollections[j])
        {
            if (node.Eow || trackingNodes.Contains(node))
            {
                DawgNode foundNode = null;
                var nodeId = GetDawgNodeId(node);
                if (uniqNodeTables.TryGetValue(nodeId, out foundNode))
                {
                    // Replace this node with the representative that has the same identifier.
                    // The same instance can appear several times in a level list, hence the check.
                    if (node != foundNode)
                    {
                        node.Parent.RemoveChild(node);
                        node.Parent.AddChild(foundNode);
                    }

                    foundNode.Eow |= node.Eow;

                    // Both parents changed shape and must be re-examined on their own level.
                    trackingNodes.Add(node.Parent);
                    trackingNodes.Add(foundNode.Parent);
                }
                else
                {
                    uniqNodeTables[nodeId] = node;
                }
            }
        }
    }

    return new Dawg(root);
}
/// <summary>
/// Builds a <c>Dawg</c> from a bag of words by inserting every word into a trie and then
/// merging nodes that share the same identifier.
/// </summary>
/// <param name="wordBag">Pairs of word text and the frequency stored on the word's final node.</param>
/// <returns>A <c>Dawg</c> wrapping the root of the resulting graph.</returns>
public Dawg Build(IEnumerable<KeyValuePair<string, int>> wordBag)
{
    var root = new DawgNode();
    var levelNodeCollections = InsertWords(root, wordBag);
    MergeEquivalentNodes(levelNodeCollections);
    return new Dawg(root);
}

/// <summary>
/// Inserts every word below <paramref name="root"/> and returns the visited nodes grouped by level.
/// </summary>
private static Dictionary<int, List<DawgNode>> InsertWords(DawgNode root, IEnumerable<KeyValuePair<string, int>> wordBag)
{
    var levelNodeCollections = new Dictionary<int, List<DawgNode>>();
    foreach (var pair in wordBag)
    {
        var word = pair.Key;
        var nextNode = root;
        var level = 0;
        for (var i = 0; i < word.Length; i++, level++)
        {
            var ch = word[i];
            var currNode = nextNode.Next(ch);
            if (currNode == null)
            {
                currNode = new DawgNode(ch) { Depth = level };
                nextNode.AddChild(currNode);
            }

            // Collect nodes by level; a node shared by several words is recorded once per visit.
            List<DawgNode> nodes = null;
            if (!levelNodeCollections.TryGetValue(level, out nodes))
            {
                nodes = new List<DawgNode>();
                levelNodeCollections[level] = nodes;
            }

            nodes.Add(currNode);
            nextNode = currNode;
        }

        // The last node reached marks the end of the word and carries its frequency.
        nextNode.Eow = true;
        nextNode.Frequency = pair.Value;
    }

    return levelNodeCollections;
}

/// <summary>
/// Merges nodes with the same identifier, working from the deepest level up to level 0.
/// </summary>
private static void MergeEquivalentNodes(Dictionary<int, List<DawgNode>> levelNodeCollections)
{
    // To avoid visiting every node, only end-of-word nodes and parents touched by an
    // earlier merge are examined.
    var trackingNodes = new HashSet<DawgNode>();
    for (var j = levelNodeCollections.Count - 1; j >= 0; j--)
    {
        // NOTE(review): identity is the int returned by GetDawgNodeId; different
        // sub-structures with the same hash would be merged - confirm this is acceptable.
        var uniqNodeTables = new Dictionary<int, DawgNode>();
        foreach (var node in levelNodeCollections[j])
        {
            if (!node.Eow && !trackingNodes.Contains(node))
            {
                continue;
            }

            DawgNode foundNode = null;
            var nodeId = GetDawgNodeId(node);
            if (!uniqNodeTables.TryGetValue(nodeId, out foundNode))
            {
                uniqNodeTables[nodeId] = node;
                continue;
            }

            // Replace this node with the representative that has the same identifier.
            // The same instance can appear several times in a level list, hence the check.
            if (node != foundNode)
            {
                node.Parent.RemoveChild(node);
                node.Parent.AddChild(foundNode);
            }

            foundNode.Eow |= node.Eow;

            // Both parents changed shape and must be re-examined on their own level.
            trackingNodes.Add(node.Parent);
            trackingNodes.Add(foundNode.Parent);
        }
    }
}
/// <summary>
/// Creates a dawg that uses <paramref name="dawgTree"/> as its root node.
/// </summary>
/// <param name="dawgTree">Root node of an already built graph; stored as given.</param>
public Dawg(DawgNode dawgTree)
{
    this._root = dawgTree;
}
/// <summary>
/// Lazily walks the children of <paramref name="node"/> depth first and yields every word found.
/// </summary>
/// <param name="commonPrefix">Text of the path that leads to <paramref name="node"/>.</param>
/// <param name="node">Starting node; when <c>null</c> the sequence is empty.</param>
/// <returns>Pairs of word text and the <c>Frequency</c> of the node ending that word.</returns>
private static IEnumerable<KeyValuePair<string, int>> IterateNodesString(string commonPrefix, DawgNode node)
{
    if (node == null)
    {
        yield break;
    }

    foreach (var branch in node.ChildNodes)
    {
        var prefix = commonPrefix + branch.Char;

        // Report the word ending at this child before descending to longer words.
        if (branch.Eow)
        {
            var entry = new KeyValuePair<string, int>(prefix, branch.Frequency);
            yield return entry;
        }

        foreach (var nested in IterateNodesString(prefix, branch))
        {
            yield return nested;
        }
    }
}