Ejemplo n.º 1
0
 /// <summary>
 /// Detaches the child registered under the character of <paramref name="node"/>.
 /// </summary>
 /// <remarks>
 /// The entry is selected by character only; the stored instance is not compared with
 /// <paramref name="node"/>.
 /// </remarks>
 /// <param name="node">Node whose <c>Char</c> selects the entry to remove.</param>
 /// <returns><c>true</c> if an entry for that character existed and was removed; otherwise <c>false</c>.</returns>
 public bool RemoveChild(DawgNode node)
 {
     // Remove already reports whether the key was present, so a separate
     // ContainsKey probe would only repeat the same lookup.
     return _childs.Remove(node.Char);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Looks up the child node registered for the given character.
        /// </summary>
        /// <param name="text">Character to follow from this node.</param>
        /// <returns>The matching child, or <c>null</c> when no child is stored for that character.</returns>
        public DawgNode Next(char text)
        {
            DawgNode child;

            return _childs.TryGetValue(text, out child) ? child : null;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Computes an integer id for <paramref name="node"/> together with its descendants.
        /// </summary>
        /// <remarks>
        /// A signature string is assembled from every node returned by <c>DescendantsAndSelf()</c>:
        /// the node's character followed by '1' or '0' for its end-of-word flag. The id is the
        /// <c>FnvHash</c> of that string, cast to <c>int</c>.
        /// </remarks>
        /// <param name="node">Node whose signature is hashed.</param>
        /// <returns>The hash of the signature string.</returns>
        private static int GetDawgNodeId(DawgNode node)
        {
            var signature = new StringBuilder();

            foreach (var current in node.DescendantsAndSelf())
            {
                var eowMarker = current.Eow ? '1' : '0';
                signature.Append(current.Char).Append(eowMarker);
            }

            var hash = FnvHash.GetHash(signature.ToString());
            return (int)hash;
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Registers <paramref name="node"/> as a child of this node, keyed by its character.
 /// </summary>
 /// <param name="node">Node to attach.</param>
 /// <returns>
 /// <c>false</c> when a child with the same character is already present (nothing is changed);
 /// otherwise <c>true</c>.
 /// </returns>
 public bool AddChild(DawgNode node)
 {
     var key = node.Char;
     var isNewKey = !_childs.ContainsKey(key);

     if (isNewKey)
     {
         // Only a node without a parent is adopted; an existing parent link is left untouched.
         if (node.Parent == null)
         {
             node.Parent = this;
         }
         _childs.Add(key, node);
     }
     return isNewKey;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads a serialized dawg from <paramref name="stream"/> and rebuilds its node graph.
        /// </summary>
        /// <param name="stream">
        /// Binary stream positioned at the start of the dawg data. It is wrapped in a
        /// <c>BinaryReader</c> that is disposed before this method returns.
        /// </param>
        /// <returns>A <c>Dawg</c> built around the decoded root node.</returns>
        /// <exception cref="ApplicationException">
        /// The version stored in the stream differs from the decoder's version.
        /// </exception>
        public Dawg Decode(Stream stream)
        {
            using (var input = new BinaryReader(stream))
            {
                // The data starts with the format version; anything else is rejected.
                var streamVersion = input.ReadSingle();
                if (_version != streamVersion)
                {
                    throw new ApplicationException(string.Format("The file version of dawg is not match.\rThe decoder version is {0},but file version is {1}", _version, streamVersion));
                }

                // Node table: a count followed by one record per node.
                var nodeTable = new DawgNode[input.ReadInt32()];
                var index = 0;
                while (index < nodeTable.Length)
                {
                    // The reads below must stay in this order: char, frequency, depth, end-of-word flag, size.
                    var symbol    = (char)input.ReadInt16();
                    var frequency = input.ReadInt32();
                    var depth     = input.ReadInt32();
                    var isEow     = input.ReadBoolean();
                    var size      = input.ReadInt32();

                    nodeTable[index] = new DawgNode(symbol, size)
                    {
                        Eow       = isEow,
                        Depth     = depth,
                        Frequency = frequency
                    };
                    index++;
                }

                // Root edges: a count followed by that many indices into the node table.
                var rootEdgeCount = input.ReadInt32();
                var root = new DawgNode(default(char), rootEdgeCount);
                for (var n = 0; n < rootEdgeCount; n++)
                {
                    root.AddChild(nodeTable[input.ReadInt32()]);
                }

                // Remaining edges: for each listed parent, its index, a child count, then the child indices.
                var parentCount = input.ReadInt32();
                for (var p = 0; p < parentCount; p++)
                {
                    var parent    = nodeTable[input.ReadInt32()];
                    var edgeCount = input.ReadInt32();
                    for (var c = 0; c < edgeCount; c++)
                    {
                        parent.AddChild(nodeTable[input.ReadInt32()]);
                    }
                }
                return new Dawg(root);
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Reads a serialized dawg from <paramref name="stream"/> and rebuilds its node graph.
        /// </summary>
        /// <param name="stream">
        /// Binary stream positioned at the start of the dawg data. It is wrapped in a
        /// <c>BinaryReader</c> that is disposed before this method returns.
        /// </param>
        /// <returns>A <c>Dawg</c> built around the decoded root node.</returns>
        /// <exception cref="ApplicationException">
        /// The version stored in the stream differs from the decoder's version.
        /// </exception>
        public Dawg Decode(Stream stream)
        {
            using (var input = new BinaryReader(stream))
            {
                // The data starts with the format version (stored as a double); anything else is rejected.
                var streamVersion = input.ReadDouble();
                if (_version != streamVersion)
                {
                    throw new ApplicationException(string.Format("The file version of dawg is not match.\rThe decoder version is {0},but file version is {1}", _version, streamVersion));
                }

                // Node table: a count followed by one record per node.
                var nodeTable = new DawgNode[input.ReadInt32()];
                var index = 0;
                while (index < nodeTable.Length)
                {
                    // The reads below must stay in this order: char, frequency, depth, end-of-word flag, size.
                    var symbol = (char)input.ReadInt16();
                    var frequency = input.ReadInt32();
                    var depth = input.ReadInt32();
                    var isEow = input.ReadBoolean();
                    var size = input.ReadInt32();

                    nodeTable[index] = new DawgNode(symbol, size)
                    {
                        Eow = isEow,
                        Depth = depth,
                        Frequency = frequency
                    };
                    index++;
                }

                // Root edges: a count followed by that many indices into the node table.
                var rootEdgeCount = input.ReadInt32();
                var root = new DawgNode(default(char), rootEdgeCount);
                for (var n = 0; n < rootEdgeCount; n++)
                {
                    root.AddChild(nodeTable[input.ReadInt32()]);
                }

                // Remaining edges: for each listed parent, its index, a child count, then the child indices.
                var parentCount = input.ReadInt32();
                for (var p = 0; p < parentCount; p++)
                {
                    var parent = nodeTable[input.ReadInt32()];
                    var edgeCount = input.ReadInt32();
                    for (var c = 0; c < edgeCount; c++)
                    {
                        parent.AddChild(nodeTable[input.ReadInt32()]);
                    }
                }
                return new Dawg(root);
            }
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Lazily enumerates every word stored below <paramref name="node"/>, depth first,
 /// paired with the frequency of the node that ends it.
 /// </summary>
 /// <param name="commonPrefix">Characters accumulated on the path leading to <paramref name="node"/>.</param>
 /// <param name="node">Node whose children are walked; <c>null</c> yields nothing.</param>
 /// <returns>One (word, frequency) pair per end-of-word node reached.</returns>
 private static IEnumerable<KeyValuePair<string, int>> IterateNodesString(string commonPrefix, DawgNode node)
 {
     if (node != null)
     {
         foreach (var child in node.ChildNodes)
         {
             var word = commonPrefix + child.Char;

             // An end-of-word node contributes its own entry before its descendants are visited.
             if (child.Eow)
             {
                 yield return new KeyValuePair<string, int>(word, child.Frequency);
             }

             foreach (var entry in IterateNodesString(word, child))
             {
                 yield return entry;
             }
         }
     }
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Produces the next token by matching up to three consecutive dictionary words
        /// starting at the reader's current position and emitting the first word of the
        /// selected candidate chunk.
        /// </summary>
        /// <returns>
        /// <c>null</c> when the peeked character reports <c>IsNull()</c>; the result of
        /// <c>base.Next()</c> when the dictionary cannot match at this position; otherwise
        /// a <c>TokenType.CJK</c> token for the first word of the chosen chunk.
        /// </returns>
        public override Token Next()
        {
            // Remember where we started so every fallback path can rewind the reader.
            var baseOffset = _reader.Position;
            var code       = _reader.Peek();

            // NOTE(review): IsNull() presumably signals end of input — confirm.
            if (code.IsNull())
            {
                return(null);
            }
            DawgNode node = _dawg.Root.Next(code);

            // Quick reject: if the root has no child for this character, or that child has
            // no children of its own, hand the position over to the base tokenizer.
            if (node == null || !node.HasChildNodes)
            {
                _reader.Seek(baseOffset);
                return(base.Next());
            }

            // NOTE(review): MatchedNodes presumably returns the dictionary nodes of every
            // word that starts at the given offset — confirm against its definition.
            var firstOfNodes = this.MatchedNodes(baseOffset);

            if (firstOfNodes.Count == 0)
            {
                _reader.Seek(baseOffset);
                return(base.Next());
            }
            var maxLength = 0;
            var chunks    = new List <Chunk>(3);

            // Enumerate candidate chunks of one, two or three consecutive words. Each list
            // is walked from its last element to its first; a word's length is taken as
            // its node's Depth + 1.
            for (var i = firstOfNodes.Count - 1; i >= 0; i--)
            {
                var offset1       = baseOffset + firstOfNodes[i].Depth + 1;
                var secondOfNodes = this.MatchedNodes(offset1);
                if (secondOfNodes.Count > 0)
                {
                    for (var j = secondOfNodes.Count - 1; j >= 0; j--)
                    {
                        var offset2      = offset1 + secondOfNodes[j].Depth + 1;
                        var thirdOfNodes = this.MatchedNodes(offset2);
                        if (thirdOfNodes.Count > 0)
                        {
                            for (var k = thirdOfNodes.Count - 1; k >= 0; k--)
                            {
                                var offset3 = offset2 + thirdOfNodes[k].Depth + 1;
                                var length  = offset3 - baseOffset;
                                // Rule 1 (maximum matching): keep a three-word chunk only if it is
                                // at least as long as the longest seen so far. Shorter chunks added
                                // earlier are not removed from the list here.
                                if (length >= maxLength)
                                {
                                    maxLength = length;
                                    var chunk = new Chunk(length,
                                                          new WordPoint[] { new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency),
                                                                            new WordPoint(offset1, offset2 - offset1, secondOfNodes[j].Frequency),
                                                                            new WordPoint(offset2, offset3 - offset2, thirdOfNodes[k].Frequency) });
                                    chunks.Add(chunk);
                                }
                            }
                        }
                        else
                        {
                            // No third word matches: the candidate is a two-word chunk.
                            var length = offset2 - baseOffset;
                            // Rule 1 (maximum matching), same test as above.
                            if (length >= maxLength)
                            {
                                maxLength = length;
                                var chunk = new Chunk(length, new WordPoint[] {
                                    new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency),
                                    new WordPoint(offset1, offset2 - offset1, secondOfNodes[j].Frequency)
                                });
                                chunks.Add(chunk);
                            }
                        }
                    }
                }
                else
                {
                    // No second word matches: the candidate is a single-word chunk.
                    var length = offset1 - baseOffset;
                    // Rule 1 (maximum matching), same test as above.
                    if (length >= maxLength)
                    {
                        maxLength = length;
                        var chunk = new Chunk(length, new WordPoint[] { new WordPoint(baseOffset, offset1 - baseOffset, firstOfNodes[i].Frequency) });
                        chunks.Add(chunk);
                    }
                }
            }
            // With several candidates, run the filters in order and stop as soon as one
            // reports a count of 1.
            // NOTE(review): Apply presumably moves the surviving chunks to the front of the
            // list and returns how many remain — confirm, since chunks[0] is used below.
            if (chunks.Count > 1)
            {
                var count = chunks.Count;
                foreach (var filter in Filters)
                {
                    if ((count = filter.Apply(chunks, count)) == 1)
                    {
                        break;
                    }
                }
            }
            // Emit only the first word of the chosen chunk: seek to its offset and read its
            // length, which leaves the reader at the start of the following word.
            var bestChunk = chunks[0];

            _reader.Seek(bestChunk[0].Offset);
            return(new Token(new string(_reader.Read(bestChunk[0].Length)), TokenType.CJK));
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Attaches <paramref name="node"/> under this node, using its character as the key.
 /// </summary>
 /// <param name="node">Node to attach.</param>
 /// <returns>
 /// <c>true</c> when the node was added; <c>false</c> when a child with the same character
 /// already exists, in which case nothing is modified.
 /// </returns>
 public bool AddChild(DawgNode node)
 {
     var key = node.Char;
     var added = false;

     if (!_childs.ContainsKey(key))
     {
         // A node that already has a parent keeps it; only parentless nodes are adopted.
         if (node.Parent == null)
         {
             node.Parent = this;
         }
         _childs.Add(key, node);
         added = true;
     }
     return added;
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Removes the child entry stored under the character of <paramref name="node"/>.
 /// </summary>
 /// <remarks>
 /// Only the character is used to find the entry; the stored instance is not compared
 /// with <paramref name="node"/>.
 /// </remarks>
 /// <param name="node">Node whose <c>Char</c> identifies the entry to remove.</param>
 /// <returns><c>true</c> if an entry was found and removed; <c>false</c> if none existed.</returns>
 public bool RemoveChild(DawgNode node)
 {
     // A single Remove call both tests for the key and deletes it, avoiding the
     // second lookup that a preceding ContainsKey would cost.
     return _childs.Remove(node.Char);
 }
Ejemplo n.º 11
0
 /// <summary>
 /// Computes an integer id for <paramref name="node"/> together with its descendants.
 /// </summary>
 /// <remarks>
 /// For every node returned by <c>DescendantsAndSelf()</c> the node's character and then
 /// '1' or '0' (its end-of-word flag) are appended to a signature string; the id is the
 /// <c>FnvHash</c> of that string, cast to <c>int</c>.
 /// </remarks>
 /// <param name="node">Node whose signature is hashed.</param>
 /// <returns>The hash of the signature string.</returns>
 private static int GetDawgNodeId(DawgNode node)
 {
     var signature = new StringBuilder();

     foreach (var current in node.DescendantsAndSelf())
     {
         var eowMarker = current.Eow ? '1' : '0';
         signature.Append(current.Char).Append(eowMarker);
     }

     var hash = FnvHash.GetHash(signature.ToString());
     return (int)hash;
 }
Ejemplo n.º 12
0
        /// <summary>
        /// Builds a dawg from a bag of words: a trie is grown first, then nodes whose
        /// <c>GetDawgNodeId</c> values are equal are merged level by level, deepest level first.
        /// </summary>
        /// <param name="wordBag">Pairs of word and frequency to insert.</param>
        /// <returns>A <c>Dawg</c> wrapping the root of the resulting graph.</returns>
        public Dawg Build(IEnumerable<KeyValuePair<string,int>> wordBag)
        {
            var root = new DawgNode();
            var levelNodeCollections = new Dictionary<int, List<DawgNode>>();
            foreach (var pair in wordBag)
            {
                var word = pair.Key;
                var nextNode = root;
                // The character index is also the node's level, so one counter serves both.
                for (var level = 0; level < word.Length; level++)
                {
                    var ch = word[level];
                    var currNode = nextNode.Next(ch);
                    if (currNode == null)
                    {
                        currNode = new DawgNode(ch) { Depth = level };
                        nextNode.AddChild(currNode);
                    }

                    // Record the node under its level. A node shared by several words is
                    // appended once per word that passes through it.
                    List<DawgNode> nodes;
                    if (!levelNodeCollections.TryGetValue(level, out nodes))
                    {
                        nodes = new List<DawgNode>();
                        levelNodeCollections[level] = nodes;
                    }
                    nodes.Add(currNode);
                    nextNode = currNode;
                }
                // Mark the last node of the word as end-of-word (for an empty word this is the root).
                nextNode.Eow = true;
                nextNode.Frequency = pair.Value;
            }
            // Parents of merged nodes are tracked so that they are considered for merging
            // when their own (shallower) level is processed.
            var trackingNodes = new HashSet<DawgNode>();
            for (var j = levelNodeCollections.Count - 1; j >= 0; j--)
            {
                // First node seen for each id at this level.
                // NOTE(review): the id is a hash, so distinct sub-trees with a colliding hash
                // would be merged too — confirm this is acceptable.
                var uniqNodeTables = new Dictionary<int, DawgNode>();
                foreach (var node in levelNodeCollections[j])
                {
                    if (node.Eow || trackingNodes.Contains(node))
                    {
                        DawgNode foundNode;
                        var nodeId = GetDawgNodeId(node);
                        if (uniqNodeTables.TryGetValue(nodeId, out foundNode))
                        {
                            // Replace this node with the one already registered under the same id.
                            if (node != foundNode)
                            {
                                node.Parent.RemoveChild(node);
                                node.Parent.AddChild(foundNode);
                            }
                            foundNode.Eow |= node.Eow;
                            // Remember both parents for the next (shallower) pass.
                            trackingNodes.Add(node.Parent);
                            trackingNodes.Add(foundNode.Parent);
                        }
                        else
                        {
                            uniqNodeTables[nodeId] = node;
                        }
                    }
                }
            }
            return new Dawg(root);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Builds a dawg from a bag of words: a trie is grown first, then nodes whose
        /// <c>GetDawgNodeId</c> values are equal are merged level by level, deepest level first.
        /// </summary>
        /// <param name="wordBag">Pairs of word and frequency to insert.</param>
        /// <returns>A <c>Dawg</c> wrapping the root of the resulting graph.</returns>
        public Dawg Build(IEnumerable <KeyValuePair <string, int> > wordBag)
        {
            var root = new DawgNode();
            var levelNodeCollections = new Dictionary <int, List <DawgNode> >();

            foreach (var pair in wordBag)
            {
                var word     = pair.Key;
                var nextNode = root;
                // The character index is also the node's level, so one counter serves both.
                for (var level = 0; level < word.Length; level++)
                {
                    var ch       = word[level];
                    var currNode = nextNode.Next(ch);
                    if (currNode == null)
                    {
                        currNode = new DawgNode(ch)
                        {
                            Depth = level
                        };
                        nextNode.AddChild(currNode);
                    }

                    // Record the node under its level. A node shared by several words is
                    // appended once per word that passes through it.
                    List <DawgNode> nodes;
                    if (!levelNodeCollections.TryGetValue(level, out nodes))
                    {
                        nodes = new List <DawgNode>();
                        levelNodeCollections[level] = nodes;
                    }
                    nodes.Add(currNode);
                    nextNode = currNode;
                }
                // Mark the last node of the word as end-of-word (for an empty word this is the root).
                nextNode.Eow       = true;
                nextNode.Frequency = pair.Value;
            }
            // Parents of merged nodes are tracked so that they are considered for merging
            // when their own (shallower) level is processed.
            var trackingNodes = new HashSet <DawgNode>();

            for (var j = levelNodeCollections.Count - 1; j >= 0; j--)
            {
                // First node seen for each id at this level.
                // NOTE(review): the id is a hash, so distinct sub-trees with a colliding hash
                // would be merged too — confirm this is acceptable.
                var uniqNodeTables = new Dictionary <int, DawgNode>();
                foreach (var node in levelNodeCollections[j])
                {
                    if (node.Eow || trackingNodes.Contains(node))
                    {
                        DawgNode foundNode;
                        var      nodeId = GetDawgNodeId(node);
                        if (uniqNodeTables.TryGetValue(nodeId, out foundNode))
                        {
                            // Replace this node with the one already registered under the same id.
                            if (node != foundNode)
                            {
                                node.Parent.RemoveChild(node);
                                node.Parent.AddChild(foundNode);
                            }
                            foundNode.Eow |= node.Eow;
                            // Remember both parents for the next (shallower) pass.
                            trackingNodes.Add(node.Parent);
                            trackingNodes.Add(foundNode.Parent);
                        }
                        else
                        {
                            uniqNodeTables[nodeId] = node;
                        }
                    }
                }
            }
            return(new Dawg(root));
        }
Ejemplo n.º 14
0
 /// <summary>
 /// Creates a dawg over an already-built node graph.
 /// </summary>
 /// <param name="dawgTree">Node stored as this dawg's root.</param>
 public Dawg(DawgNode dawgTree)
 {
     this._root = dawgTree;
 }
Ejemplo n.º 15
0
 /// <summary>
 /// Lazily enumerates every word stored below <paramref name="node"/>, depth first,
 /// paired with the frequency of the node that ends it.
 /// </summary>
 /// <param name="commonPrefix">Characters accumulated on the path leading to <paramref name="node"/>.</param>
 /// <param name="node">Node whose children are walked; <c>null</c> yields nothing.</param>
 /// <returns>One (word, frequency) pair per end-of-word node reached.</returns>
 private static IEnumerable <KeyValuePair <string, int> > IterateNodesString(string commonPrefix, DawgNode node)
 {
     if (node != null)
     {
         foreach (var child in node.ChildNodes)
         {
             var word = commonPrefix + child.Char;

             // An end-of-word node contributes its own entry before its descendants are visited.
             if (child.Eow)
             {
                 yield return new KeyValuePair <string, int>(word, child.Frequency);
             }

             foreach (var entry in IterateNodesString(word, child))
             {
                 yield return entry;
             }
         }
     }
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Wraps the given node graph in a new dawg instance.
 /// </summary>
 /// <param name="dawgTree">Node stored as this dawg's root.</param>
 public Dawg(DawgNode dawgTree)
 {
     this._root = dawgTree;
 }