/// <summary>
/// Creates a StandardTokenizer whose dawg is decoded from the given dawg file.
/// </summary>
/// <param name="file">The dawg format file; it is opened through Utils.LoadDawgFile.</param>
public StandardTokenizer(string file)
{
    // The stream is only needed while decoding, so it is released right afterwards.
    using (var dawgStream = Utils.LoadDawgFile(file))
    {
        _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(dawgStream);
    }
}
/// <summary>
/// Writes <paramref name="dawg"/> to <paramref name="output"/> in binary form.
/// </summary>
/// <remarks>
/// Written in this order: the encoder's version value, the number of nodes, one record
/// per node (character as ushort, frequency, depth, end-of-word flag, child count),
/// the labels of the root's children, the number of nodes that have children and, for
/// each such node, its label, its child count and the labels of its children.
/// Node records carry no explicit label, so they are emitted strictly in label order;
/// a record's position in the stream is its label.
/// </remarks>
/// <param name="output">The stream that receives the encoded data. It is flushed but not closed.</param>
/// <param name="dawg">The dawg to encode.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="output"/> or <paramref name="dawg"/> is null.
/// </exception>
public void Encode(Stream output, Dawg dawg)
{
    if (output == null)
    {
        throw new System.ArgumentNullException("output");
    }

    if (dawg == null)
    {
        throw new System.ArgumentNullException("dawg");
    }

    // Label every distinct node with the index at which it is first discovered.
    // The list keeps the nodes in label order: Dictionary enumeration order is
    // unspecified, so the records below must not be driven by the dictionary.
    var nodeLabels = new Dictionary<DawgNode, int>();
    var orderedNodes = new List<DawgNode>();
    foreach (var node in dawg.Root.Descendants())
    {
        if (nodeLabels.ContainsKey(node))
        {
            continue;
        }

        nodeLabels.Add(node, orderedNodes.Count);
        orderedNodes.Add(node);
    }

    // The writer is deliberately not disposed: disposing it would close the caller's stream.
    var writer = new BinaryWriter(output, Encoding.UTF8);

    // Header: version, node count, then one record per node in label order.
    writer.Write(_version);
    writer.Write(orderedNodes.Count);

    var parentCount = 0;
    foreach (var node in orderedNodes)
    {
        writer.Write((ushort)node.Char);
        writer.Write(node.Frequency);
        writer.Write(node.Depth);
        writer.Write(node.Eow);
        writer.Write(node.ChildNodes.Count);

        if (node.HasChildNodes)
        {
            parentCount++;
        }
    }

    // Top level: the labels of the root's children.
    writer.Write(dawg.Root.ChildNodes.Count);
    foreach (var topNode in dawg.Root.ChildNodes)
    {
        writer.Write(nodeLabels[topNode]);
    }

    // Edges: for every node that has children, its label followed by its children's labels.
    writer.Write(parentCount);
    for (var label = 0; label < orderedNodes.Count; label++)
    {
        var parent = orderedNodes[label];
        if (!parent.HasChildNodes)
        {
            continue;
        }

        writer.Write(label);
        writer.Write(parent.ChildNodes.Count);
        foreach (var child in parent.ChildNodes)
        {
            writer.Write(nodeLabels[child]);
        }
    }

    // Push everything through to the underlying stream since the writer is never disposed.
    writer.Flush();
}
/// <summary>
/// Creates a StandardTokenizer whose dawg is decoded from a dawg file on disk.
/// </summary>
/// <param name="file">The path of the dawg format file.</param>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
public StandardTokenizer(string file)
{
    bool dawgFileMissing = !File.Exists(file);
    if (dawgFileMissing)
    {
        throw new FileNotFoundException("The file of dawg does not exist.", file);
    }

    // Read-only access is enough; the file is closed as soon as decoding is done.
    using (var input = File.OpenRead(file))
    {
        _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(input);
    }
}
/// <summary>
/// Initialize a new instance of MaximumMatchTokenBreaker.
/// </summary>
/// <param name="dawg">The dawg stored by this breaker.</param>
/// <param name="reader">The reader stored by this breaker; it is also passed to the base constructor.</param>
public MaximumMatchTokenBreaker(Dawg dawg, RewindStringReader reader)
    : base(reader)
{
    this._dawg = dawg;
    this._reader = reader;
}
/// <summary>
/// Serializes <paramref name="dawg"/> into <paramref name="output"/> as binary data.
/// </summary>
/// <remarks>
/// Sections, in write order:
/// 1. the encoder's version value and the total number of nodes;
/// 2. one record per node: character (as ushort), frequency, depth, end-of-word flag, child count;
/// 3. the number of children of the root, followed by their labels;
/// 4. the number of nodes that have children and, for each of them, its label,
///    its child count and the labels of its children.
/// The records of section 2 do not contain a label, so the label of a node is its
/// position in that section; the records are therefore written in label order.
/// </remarks>
/// <param name="output">Destination stream. It is flushed at the end and left open.</param>
/// <param name="dawg">The dawg to serialize.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="output"/> or <paramref name="dawg"/> is null.
/// </exception>
public void Encode(Stream output, Dawg dawg)
{
    if (output == null)
    {
        throw new System.ArgumentNullException("output");
    }

    if (dawg == null)
    {
        throw new System.ArgumentNullException("dawg");
    }

    // Give each distinct node a label equal to its discovery index and remember the
    // nodes in that same order. Enumerating the dictionary instead would rely on an
    // ordering that Dictionary does not promise.
    var labelOf = new Dictionary<DawgNode, int>();
    var nodesByLabel = new List<DawgNode>();
    foreach (var node in dawg.Root.Descendants())
    {
        if (!labelOf.ContainsKey(node))
        {
            labelOf.Add(node, nodesByLabel.Count);
            nodesByLabel.Add(node);
        }
    }

    // Not wrapped in a using block on purpose: disposing the writer closes the caller's stream.
    var writer = new BinaryWriter(output, Encoding.UTF8);

    // Sections 1 and 2: header and node records.
    writer.Write(_version);
    writer.Write(nodesByLabel.Count);

    var nodesWithChildren = 0;
    for (var label = 0; label < nodesByLabel.Count; label++)
    {
        var node = nodesByLabel[label];
        writer.Write((ushort)node.Char);
        writer.Write(node.Frequency);
        writer.Write(node.Depth);
        writer.Write(node.Eow);
        writer.Write(node.ChildNodes.Count);

        if (node.HasChildNodes)
        {
            nodesWithChildren++;
        }
    }

    // Section 3: children of the root.
    writer.Write(dawg.Root.ChildNodes.Count);
    foreach (var rootChild in dawg.Root.ChildNodes)
    {
        writer.Write(labelOf[rootChild]);
    }

    // Section 4: child lists of every node that has children.
    writer.Write(nodesWithChildren);
    for (var label = 0; label < nodesByLabel.Count; label++)
    {
        var node = nodesByLabel[label];
        if (!node.HasChildNodes)
        {
            continue;
        }

        writer.Write(label);
        writer.Write(node.ChildNodes.Count);
        foreach (var child in node.ChildNodes)
        {
            writer.Write(labelOf[child]);
        }
    }

    // The writer is never disposed, so flush explicitly.
    writer.Flush();
}
/// <summary>
/// Builds a <see cref="Dawg"/> from a bag of words and their frequencies.
/// </summary>
/// <remarks>
/// Works in two passes: every word is first inserted as a path of nodes below a fresh
/// root, then the nodes are visited level by level, deepest level first, and nodes with
/// the same id (as computed by GetDawgNodeId) are merged into one.
/// </remarks>
/// <param name="wordBag">Pairs of word (key) and frequency (value).</param>
/// <returns>A dawg created from the root node of the built graph.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="wordBag"/> is null.</exception>
public Dawg Build(IEnumerable<KeyValuePair<string, int>> wordBag)
{
    if (wordBag == null)
    {
        throw new System.ArgumentNullException("wordBag");
    }

    var root = new DawgNode();
    var levelNodeCollections = new Dictionary<int, List<DawgNode>>();

    // Pass 1: insert each word character by character, reusing existing child nodes.
    foreach (var pair in wordBag)
    {
        var word = pair.Key;
        var nextNode = root;

        // The character index is also the level (depth) of the node.
        for (var level = 0; level < word.Length; level++)
        {
            var ch = word[level];
            var currNode = nextNode.Next(ch);
            if (currNode == null)
            {
                currNode = new DawgNode(ch) { Depth = level };
                nextNode.AddChild(currNode);
            }

            // Collect the node under its level. A node is recorded once for every word
            // that passes through it.
            // NOTE(review): the merge pass below sees these repeated entries; confirm it
            // does not depend on them before de-duplicating.
            List<DawgNode> nodes;
            if (!levelNodeCollections.TryGetValue(level, out nodes))
            {
                nodes = new List<DawgNode>();
                levelNodeCollections[level] = nodes;
            }

            nodes.Add(currNode);
            nextNode = currNode;
        }

        // The last node reached marks the end of the word (EOW) and carries its frequency.
        nextNode.Eow = true;
        nextNode.Frequency = pair.Value;
    }

    // Pass 2: merge nodes, deepest level first. Parents touched by a merge are tracked
    // so that they become merge candidates when their own level is processed.
    var trackingNodes = new HashSet<DawgNode>();
    for (var j = levelNodeCollections.Count - 1; j >= 0; j--)
    {
        // First node seen on this level for each node id.
        var uniqNodeTables = new Dictionary<int, DawgNode>();
        foreach (var node in levelNodeCollections[j])
        {
            // Only end-of-word nodes and tracked nodes are considered.
            if (!node.Eow && !trackingNodes.Contains(node))
            {
                continue;
            }

            DawgNode foundNode;
            var nodeId = GetDawgNodeId(node);
            if (!uniqNodeTables.TryGetValue(nodeId, out foundNode))
            {
                uniqNodeTables[nodeId] = node;
                continue;
            }

            // Another node with the same id exists: replace this node by it in the parent.
            if (node != foundNode)
            {
                node.Parent.RemoveChild(node);
                node.Parent.AddChild(foundNode);
            }

            foundNode.Eow |= node.Eow;

            // Track the parents involved in the merge.
            trackingNodes.Add(node.Parent);
            trackingNodes.Add(foundNode.Parent);
        }
    }

    return new Dawg(root);
}
/// <summary>
/// Creates a StandardTokenizer whose dawg is decoded from an already opened stream.
/// </summary>
/// <param name="src">The stream of the dawg file. This constructor does not close it.</param>
public StandardTokenizer(Stream src)
{
    _dawg = new DawgDecoder(Dawg.FILEVERSION).Decode(src);
}
/// <summary>
/// Builds a <see cref="Dawg"/> from a bag of words and their frequencies.
/// </summary>
/// <param name="wordBag">Pairs of word (key) and frequency (value).</param>
/// <returns>A dawg created from the root node of the built graph.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="wordBag"/> is null.</exception>
public Dawg Build(IEnumerable<KeyValuePair<string, int>> wordBag)
{
    if (wordBag == null)
    {
        throw new System.ArgumentNullException("wordBag");
    }

    var root = new DawgNode();
    var nodesByLevel = InsertWords(root, wordBag);
    MergeNodesByLevel(nodesByLevel);
    return new Dawg(root);
}

// Inserts every word below root, one node per character, and returns the visited nodes grouped by level.
private static Dictionary<int, List<DawgNode>> InsertWords(DawgNode root, IEnumerable<KeyValuePair<string, int>> wordBag)
{
    var nodesByLevel = new Dictionary<int, List<DawgNode>>();
    foreach (var entry in wordBag)
    {
        var word = entry.Key;
        var current = root;

        // The character index doubles as the level (depth) of the node.
        for (var level = 0; level < word.Length; level++)
        {
            var ch = word[level];
            var child = current.Next(ch);
            if (child == null)
            {
                child = new DawgNode(ch) { Depth = level };
                current.AddChild(child);
            }

            // A node is recorded once for every word that passes through it.
            // NOTE(review): the merge pass sees these repeated entries; confirm it does
            // not depend on them before de-duplicating.
            List<DawgNode> levelNodes;
            if (!nodesByLevel.TryGetValue(level, out levelNodes))
            {
                levelNodes = new List<DawgNode>();
                nodesByLevel[level] = levelNodes;
            }

            levelNodes.Add(child);
            current = child;
        }

        // The last node reached marks the end of the word (EOW) and carries its frequency.
        current.Eow = true;
        current.Frequency = entry.Value;
    }

    return nodesByLevel;
}

// Merges nodes that share an id (GetDawgNodeId), processing the deepest level first.
private void MergeNodesByLevel(Dictionary<int, List<DawgNode>> nodesByLevel)
{
    // Parents touched by a merge; they become merge candidates on their own level.
    var trackingNodes = new HashSet<DawgNode>();
    for (var level = nodesByLevel.Count - 1; level >= 0; level--)
    {
        // First node seen on this level for each node id.
        var firstNodeById = new Dictionary<int, DawgNode>();
        foreach (var node in nodesByLevel[level])
        {
            // Only end-of-word nodes and tracked nodes are considered.
            if (!node.Eow && !trackingNodes.Contains(node))
            {
                continue;
            }

            DawgNode foundNode;
            var nodeId = GetDawgNodeId(node);
            if (!firstNodeById.TryGetValue(nodeId, out foundNode))
            {
                firstNodeById[nodeId] = node;
                continue;
            }

            // Another node with the same id exists: replace this node by it in the parent.
            if (node != foundNode)
            {
                node.Parent.RemoveChild(node);
                node.Parent.AddChild(foundNode);
            }

            foundNode.Eow |= node.Eow;

            // Track the parents involved in the merge.
            trackingNodes.Add(node.Parent);
            trackingNodes.Add(foundNode.Parent);
        }
    }
}