示例#1
0
 /// <summary>
 /// Creates a StandardTokenizer whose dawg is decoded from the given file.
 /// </summary>
 /// <param name="file">The dawg format file; it is handed to <c>Utils.LoadDawgFile</c> to obtain the stream to decode.</param>
 public StandardTokenizer(string file)
 {
     // The stream is only needed while decoding, so it is disposed as soon as Decode returns.
     using (var dawgStream = Utils.LoadDawgFile(file))
     {
         _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(dawgStream);
     }
 }
示例#2
0
 /// <summary>
 /// Initializes the tokenizer by decoding a dawg from a file.
 /// </summary>
 /// <param name="file">The dawg format file, opened through <c>Utils.LoadDawgFile</c>.</param>
 public StandardTokenizer(string file)
 {
     // Open, decode with the current file version, and release the stream when done.
     using (var source = Utils.LoadDawgFile(file))
     {
         _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(source);
     }
 }
示例#3
0
        /// <summary>
        /// Writes a binary encoding of <paramref name="dawg"/> to <paramref name="output"/>.
        /// </summary>
        /// <remarks>
        /// Sections, in write order: the <c>_version</c> field, the number of distinct nodes,
        /// one record per node (char, frequency, depth, end-of-word flag, child count),
        /// the labels of the root's children, and finally a child-label list for every node
        /// that has children.
        /// The node records do not carry their own label, so a reader can only recover a
        /// node's label from the position of its record. The records are therefore written
        /// from a list kept in label order instead of by enumerating the dictionary, whose
        /// enumeration order is documented as undefined.
        /// </remarks>
        /// <param name="output">The stream that receives the encoded data. The writer created here is not disposed, so the stream is left open.</param>
        /// <param name="dawg">The graph whose root descendants are encoded.</param>
        public void Encode(Stream output, Dawg dawg)
        {
            // Label each distinct node with its discovery index; orderedNodes[i] is the node labelled i.
            var nodeLabels   = new Dictionary<DawgNode, int>();
            var orderedNodes = new List<DawgNode>();

            foreach (var node in dawg.Root.Descendants())
            {
                // Descendants() may yield the same node more than once; label it only the first time.
                if (nodeLabels.ContainsKey(node))
                {
                    continue;
                }
                nodeLabels[node] = orderedNodes.Count;
                orderedNodes.Add(node);
            }

            var writer = new BinaryWriter(output, Encoding.UTF8);

            // Header: version, node count, then the node records in label order.
            writer.Write(_version);
            writer.Write(orderedNodes.Count);

            var parentCount = 0;
            foreach (var node in orderedNodes)
            {
                writer.Write((ushort)node.Char);
                writer.Write(node.Frequency);
                writer.Write(node.Depth);
                writer.Write(node.Eow);
                writer.Write(node.ChildNodes.Count);
                if (node.HasChildNodes)
                {
                    // Counted here so the child section can be prefixed with its length.
                    parentCount++;
                }
            }

            // Top level: the labels of the root's direct children.
            writer.Write(dawg.Root.ChildNodes.Count);
            foreach (var child in dawg.Root.ChildNodes)
            {
                writer.Write(nodeLabels[child]);
            }

            // Child section: for each node with children, its label followed by its children's labels.
            writer.Write(parentCount);
            for (var label = 0; label < orderedNodes.Count; label++)
            {
                var parent = orderedNodes[label];
                if (!parent.HasChildNodes)
                {
                    continue;
                }
                writer.Write(label);
                writer.Write(parent.ChildNodes.Count);
                foreach (var child in parent.ChildNodes)
                {
                    writer.Write(nodeLabels[child]);
                }
            }
        }
示例#4
0
 /// <summary>
 /// Creates a StandardTokenizer whose dawg is decoded from a file on disk.
 /// </summary>
 /// <param name="file">The dawg format file.</param>
 /// <exception cref="FileNotFoundException">Thrown when <paramref name="file"/> does not exist.</exception>
 public StandardTokenizer(string file)
 {
     // Fail early with a descriptive message instead of whatever the stream constructor would throw.
     if (!File.Exists(file))
     {
         throw new FileNotFoundException("The file of dawg does not exist.", file);
     }

     // Read-only access is enough for decoding; the stream is disposed once Decode returns.
     using (var input = File.OpenRead(file))
     {
         _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(input);
     }
 }
 /// <summary>
 /// Initialize a new instance of MaximumMatchTokenBreaker.
 /// </summary>
 /// <param name="dawg">The dawg stored by this breaker.</param>
 /// <param name="reader">The reader stored by this breaker; it is also passed to the base constructor.</param>
 public MaximumMatchTokenBreaker(Dawg dawg, RewindStringReader reader) : base(reader)
 {
     _dawg = dawg;
     _reader = reader;
 }
示例#6
0
 /// <summary>
 /// Serializes <paramref name="dawg"/> to <paramref name="output"/> in binary form.
 /// </summary>
 /// <remarks>
 /// The output consists of the <c>_version</c> field, the count of distinct nodes, one
 /// record per node (char, frequency, depth, end-of-word flag, child count), the labels
 /// of the root's children, and a child-label list for each node that has children.
 /// A node record does not contain its label, so the label is implied by the record's
 /// position. To keep that position reliable the nodes are tracked in a list in label
 /// order; enumerating the label dictionary instead would depend on an enumeration
 /// order that the framework documents as undefined.
 /// </remarks>
 /// <param name="output">The destination stream. The writer created here is not disposed, so the stream is left open.</param>
 /// <param name="dawg">The graph whose root descendants are written.</param>
 public void Encode(Stream output, Dawg dawg)
 {
     // labelledNodes[i] is the node whose label is i; labelOf is the reverse lookup.
     var labelOf = new Dictionary<DawgNode, int>();
     var labelledNodes = new List<DawgNode>();
     foreach (var node in dawg.Root.Descendants())
     {
         // A node reached more than once keeps the label from its first visit.
         if (labelOf.ContainsKey(node))
         {
             continue;
         }
         labelOf[node] = labelledNodes.Count;
         labelledNodes.Add(node);
     }

     var writer = new BinaryWriter(output, Encoding.UTF8);

     // The header of the dawg: version, node count, node records in label order.
     writer.Write(_version);
     writer.Write(labelledNodes.Count);
     var nodesWithChildren = 0;
     foreach (var node in labelledNodes)
     {
         writer.Write((ushort)node.Char);
         writer.Write(node.Frequency);
         writer.Write(node.Depth);
         writer.Write(node.Eow);
         writer.Write(node.ChildNodes.Count);
         if (node.HasChildNodes)
         {
             // Needed below as the length prefix of the child section.
             nodesWithChildren++;
         }
     }

     // The top nodes of the dawg: labels of the root's direct children.
     writer.Write(dawg.Root.ChildNodes.Count);
     foreach (var top in dawg.Root.ChildNodes)
     {
         writer.Write(labelOf[top]);
     }

     // The child nodes: owner label, child count, then each child's label.
     writer.Write(nodesWithChildren);
     for (var label = 0; label < labelledNodes.Count; label++)
     {
         var owner = labelledNodes[label];
         if (!owner.HasChildNodes)
         {
             continue;
         }
         writer.Write(label);
         writer.Write(owner.ChildNodes.Count);
         foreach (var child in owner.ChildNodes)
         {
             writer.Write(labelOf[child]);
         }
     }
 }
示例#7
0
        /// <summary>
        /// Builds a <see cref="Dawg"/> from a bag of words and their frequencies.
        /// </summary>
        /// <remarks>
        /// Works in two passes. First every word is inserted character by character below a
        /// fresh root, recording each visited node under its level. Then the levels are
        /// visited from the deepest to the shallowest, and nodes that share the id returned
        /// by <c>GetDawgNodeId</c> are merged into the first node seen with that id.
        /// </remarks>
        /// <param name="wordBag">Pairs of word (key) and frequency (value).</param>
        /// <returns>A new <see cref="Dawg"/> wrapping the root node.</returns>
        public Dawg Build(IEnumerable<KeyValuePair<string,int>> wordBag)
        {
            var root = new DawgNode();
            var nodesByLevel = new Dictionary<int, List<DawgNode>>();

            // Pass 1: insert every word into the tree.
            foreach (var entry in wordBag)
            {
                var word = entry.Key;
                var cursor = root;
                // The character index doubles as the level of the node holding that character.
                for (var level = 0; level < word.Length; level++)
                {
                    var ch = word[level];
                    var child = cursor.Next(ch);
                    if (child == null)
                    {
                        child = new DawgNode(ch) { Depth = level };
                        cursor.AddChild(child);
                    }

                    // Record the node under its level. This happens on every visit, so a node
                    // shared by several words appears in its level's list more than once.
                    List<DawgNode> bucket = null;
                    if (!nodesByLevel.TryGetValue(level, out bucket))
                    {
                        bucket = new List<DawgNode>();
                        nodesByLevel[level] = bucket;
                    }
                    bucket.Add(child);

                    cursor = child;
                }

                // The node reached last marks the end of the word (EOW) and carries its frequency.
                // NOTE(review): for an empty word the cursor never leaves root, so root itself is flagged - confirm this is intended.
                cursor.Eow = true;
                cursor.Frequency = entry.Value;
            }

            // Pass 2: merge equivalent nodes, deepest level first. Parents touched by a merge
            // are remembered so that they become merge candidates themselves later on.
            var mergedBranches = new HashSet<DawgNode>();
            for (var level = nodesByLevel.Count - 1; level >= 0; level--)
            {
                // First node seen for each id on this level.
                var firstById = new Dictionary<int, DawgNode>();
                foreach (var node in nodesByLevel[level])
                {
                    // Only end-of-word nodes and already tracked branches are candidates.
                    if (!node.Eow && !mergedBranches.Contains(node))
                    {
                        continue;
                    }

                    var nodeId = GetDawgNodeId(node);
                    DawgNode keeper = null;
                    if (!firstById.TryGetValue(nodeId, out keeper))
                    {
                        firstById[nodeId] = node;
                        continue;
                    }

                    // Same id already seen: re-point the parent at the kept node.
                    if (node != keeper)
                    {
                        node.Parent.RemoveChild(node);
                        node.Parent.AddChild(keeper);
                    }
                    keeper.Eow |= node.Eow;

                    // Track both parents for the levels still to be visited.
                    mergedBranches.Add(node.Parent);
                    mergedBranches.Add(keeper.Parent);
                }
            }

            return new Dawg(root);
        }
示例#8
0
        /// <summary>
        /// Creates a StandardTokenizer whose dawg is decoded from a stream.
        /// </summary>
        /// <param name="src">The stream of the dawg file. It is not disposed by this constructor.</param>
        public StandardTokenizer(Stream src)
        {
            // Decode with the file version this library currently declares.
            _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(src);
        }
 /// <summary>
 /// Creates a MaximumMatchTokenBreaker over the given dawg and reader.
 /// </summary>
 /// <param name="dawg">The dawg kept in <c>_dawg</c>.</param>
 /// <param name="reader">The reader kept in <c>_reader</c> and forwarded to the base constructor.</param>
 public MaximumMatchTokenBreaker(Dawg dawg, RewindStringReader reader) : base(reader)
 {
     _dawg = dawg;
     _reader = reader;
 }
示例#10
0
        /// <summary>
        /// Creates a <see cref="Dawg"/> out of words paired with their frequencies.
        /// </summary>
        /// <remarks>
        /// The words are first laid out as a tree below a new root, with every visited node
        /// also recorded in a per-level list. The levels are then processed bottom-up:
        /// within a level, a candidate node whose <c>GetDawgNodeId</c> value was already
        /// seen is replaced in its parent by the node seen first.
        /// </remarks>
        /// <param name="wordBag">The words (keys) and their frequencies (values).</param>
        /// <returns>A <see cref="Dawg"/> constructed around the root node.</returns>
        public Dawg Build(IEnumerable <KeyValuePair <string, int> > wordBag)
        {
            var root   = new DawgNode();
            var levels = new Dictionary <int, List <DawgNode> >();

            foreach (var pair in wordBag)
            {
                var current = root;
                var level   = 0;
                foreach (var ch in pair.Key)
                {
                    // Follow the existing edge for this character, or create it.
                    var next = current.Next(ch);
                    if (next == null)
                    {
                        next = new DawgNode(ch)
                        {
                            Depth = level
                        };
                        current.AddChild(next);
                    }

                    // Collect the node under its level; repeated visits add it again.
                    List <DawgNode> sameLevel = null;
                    if (!levels.TryGetValue(level, out sameLevel))
                    {
                        sameLevel     = new List <DawgNode>();
                        levels[level] = sameLevel;
                    }
                    sameLevel.Add(next);

                    current = next;
                    level++;
                }
                // The last node of the word is its end of word (EOW) and stores the frequency.
                // NOTE(review): an empty key leaves current at root, so root gets flagged - confirm this is intended.
                current.Eow       = true;
                current.Frequency = pair.Value;
            }

            // Parents involved in a merge are tracked so they are examined on their own level.
            var tracked = new HashSet <DawgNode>();

            for (var depth = levels.Count - 1; depth >= 0; depth--)
            {
                var seenIds = new Dictionary <int, DawgNode>();
                foreach (var node in levels[depth])
                {
                    // Skip nodes that neither end a word nor belong to a tracked branch.
                    if (!node.Eow && !tracked.Contains(node))
                    {
                        continue;
                    }

                    var      id       = GetDawgNodeId(node);
                    DawgNode original = null;
                    if (!seenIds.TryGetValue(id, out original))
                    {
                        // First node with this id on this level: keep it as the merge target.
                        seenIds[id] = node;
                        continue;
                    }

                    // Merge: the parent drops this node and adopts the one seen first.
                    if (node != original)
                    {
                        node.Parent.RemoveChild(node);
                        node.Parent.AddChild(original);
                    }
                    original.Eow |= node.Eow;

                    tracked.Add(node.Parent);
                    tracked.Add(original.Parent);
                }
            }

            return(new Dawg(root));
        }