/**
 * <summary>Reads a node, and recursively all of its descendants, from a text source.
 * A non-root node starts with one line holding its symbol; every node then has one line of four
 * space-separated fields: count, probability, probability of unseen, and number of children.
 * The child nodes follow immediately in the same format.</summary>
 *
 * <param name="isRootNode">true for the root node, which has no symbol line.</param>
 * <param name="multipleFile">source the lines are read from.</param>
 */
public NGramNode(bool isRootNode, MultipleFile multipleFile)
{
    if (!isRootNode)
    {
        var symbolText = multipleFile.ReadLine().Trim();
        _symbol = (TSymbol)Convert.ChangeType(symbolText, typeof(TSymbol));
    }

    var fields = multipleFile.ReadLine().Split(" ");
    _count = int.Parse(fields[0]);
    _probability = double.Parse(fields[1]);
    _probabilityOfUnseen = double.Parse(fields[2]);
    var childCount = int.Parse(fields[3]);

    // Leaf nodes keep _children unset (null) rather than holding an empty dictionary.
    if (childCount <= 0)
    {
        return;
    }

    _children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
    for (var remaining = childCount; remaining > 0; remaining--)
    {
        var child = new NGramNode<TSymbol>(false, multipleFile);
        _children.Add(child._symbol, child);
    }
}
/**
 * <summary>Creates an empty N-Gram model of the given order: an empty vocabulary, one
 * probability-of-unseen slot per level, and a root node built from the default symbol.</summary>
 *
 * <param name="n">size of ngram.</param>
 */
public NGram(int n)
{
    _n = n;
    _vocabulary = new HashSet<TSymbol>();
    _probabilityOfUnseen = new double[n];
    rootNode = new NGramNode<TSymbol>(default(TSymbol));
}
/**
 * <summary>Adds the NGram that starts at <c>index</c> in <c>sentence</c> below this node,
 * creating missing nodes on the way and increasing the count of every node on the path.</summary>
 *
 * <param name="sentence">array of symbols.</param>
 * <param name="index">start index of the NGram inside the sentence.</param>
 * <param name="height">number of consecutive symbols to add: 1 treats the NGram as a UniGram,
 * 2 as a Bigram, etc. A height of 0 adds nothing.</param>
 * <param name="count">number of times this NGram is added.</param>
 */
public void AddNGram(TSymbol[] sentence, int index, int height, int count = 1)
{
    var node = this;
    var position = index;

    // One step per level; each step descends into the child for the next symbol.
    for (var remaining = height; remaining != 0; remaining--)
    {
        var symbol = sentence[position];

        if (node._children == null)
        {
            node._children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
        }

        if (!node._children.TryGetValue(symbol, out var next))
        {
            next = new NGramNode<TSymbol>(symbol);
            node._children.Add(symbol, next);
        }

        next._count += count;
        node = next;
        position++;
    }
}
/**
 * <summary>Constructor of the NGram class which reads the model from a list of text files.
 * The header is read from the current stream reader of the combined source, then the node tree
 * is read starting at the root.</summary>
 *
 * <param name="fileNameList">names of the text files where the NGram is saved.</param>
 */
public NGram(params string[] fileNameList)
{
    var multipleFile = new MultipleFile(fileNameList);
    try
    {
        ReadHeader(multipleFile.GetStreamReader());
        rootNode = new NGramNode<TSymbol>(true, multipleFile);
    }
    finally
    {
        // Close the source even when the header or a node line cannot be read.
        multipleFile.Close();
    }
}
/**
 * <summary>Constructor of the NGram class which reads the model from a single text file:
 * first the header, then the node tree starting at the root.</summary>
 *
 * <param name="fileName">name of the text file where NGram is saved.</param>
 */
public NGram(string fileName)
{
    // The using statement disposes the reader even when reading fails part-way through.
    using (var br = new StreamReader(fileName))
    {
        ReadHeader(br);
        rootNode = new NGramNode<TSymbol>(true, br);
    }
}
/**
 * <summary>Builds an N-Gram model of the given order from a corpus. Starts from an empty model
 * and adds every sentence of the corpus, in order, through AddNGramSentence.</summary>
 *
 * <param name="corpus">list of sentences whose ngrams are added.</param>
 * <param name="n">size of ngram.</param>
 */
public NGram(List<List<TSymbol>> corpus, int n)
{
    _n = n;
    _vocabulary = new HashSet<TSymbol>();
    _probabilityOfUnseen = new double[n];
    rootNode = new NGramNode<TSymbol>(default(TSymbol));

    foreach (var sentence in corpus)
    {
        AddNGramSentence(sentence.ToArray());
    }
}
/**
 * <summary>Replaces words not in the given dictionary.
 * Children whose symbol is not in the dictionary are removed from this node; their own children
 * are moved under a fresh unknown node whose count is the sum of the removed children's counts.
 * The replacement is then applied recursively to the unknown node and to every remaining child.</summary>
 *
 * <param name="dictionary">dictionary of known words.</param>
 */
public void ReplaceUnknownWords(HashSet<TSymbol> dictionary)
{
    if (_children == null)
    {
        return;
    }

    // Collect first: _children must not be modified while it is being enumerated.
    var unknownChildren = new List<NGramNode<TSymbol>>();
    foreach (var entry in _children)
    {
        if (!dictionary.Contains(entry.Key))
        {
            unknownChildren.Add(entry.Value);
        }
    }

    if (unknownChildren.Count > 0)
    {
        _unknown = new NGramNode<TSymbol>(default(TSymbol));
        _unknown._children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
        var sum = 0;
        foreach (var child in unknownChildren)
        {
            if (child._children != null)
            {
                foreach (var grandChild in child._children)
                {
                    // Indexer instead of Add: two unknown words may share a following symbol,
                    // and Add would throw ArgumentException on the second one.
                    // NOTE(review): the later subtree replaces the earlier one; their counts
                    // are not combined. Confirm this is the intended handling.
                    _unknown._children[grandChild.Key] = grandChild.Value;
                }
            }

            sum += child._count;
            _children.Remove(child._symbol);
        }

        _unknown._count = sum;
        _unknown.ReplaceUnknownWords(dictionary);
    }

    foreach (var child in _children.Values)
    {
        child.ReplaceUnknownWords(dictionary);
    }
}
/**
 * <summary>Merges another node into this one. The other node's count is added to this node's
 * count. Children present in both nodes are merged recursively; children present only in the
 * other node are attached to this node (the same node instances, not copies).</summary>
 *
 * <param name="toBeMerged">node whose count and children are merged into this node.</param>
 */
public void Merge(NGramNode<TSymbol> toBeMerged)
{
    // A leaf on the other side contributes only its count.
    if (toBeMerged._children != null)
    {
        // If this node is a leaf, start an empty child map so the other node's
        // children are adopted instead of being dropped.
        if (_children == null)
        {
            _children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
        }

        foreach (var entry in toBeMerged._children)
        {
            if (_children.TryGetValue(entry.Key, out var existing))
            {
                existing.Merge(entry.Value);
            }
            else
            {
                _children[entry.Key] = entry.Value;
            }
        }
    }

    _count += toBeMerged.GetCount();
}
/**
 * <summary>Prunes rarely seen children N levels below this node. At the target level
 * (N == 0), every child whose share of this node's count (child count / node count) is below
 * the threshold is removed. If that would remove all children, the child with the highest
 * count is kept. For N greater than 0 the call is forwarded to every child with N - 1.
 * Nodes without children are left untouched.</summary>
 *
 * <param name="threshold">minimum share of this node's count a child needs to survive.</param>
 * <param name="N">number of levels to descend before pruning.</param>
 */
public void Prune(double threshold, int N)
{
    // Leaf nodes have no child map; there is nothing to prune or to descend into.
    if (_children == null)
    {
        return;
    }

    if (N == 0)
    {
        TSymbol maxElement = default;
        NGramNode<TSymbol> maxNode = null;
        var toBeDeleted = new List<TSymbol>();
        foreach (var entry in _children)
        {
            // "+ 0.0" forces floating-point division of the two integer counts.
            if (entry.Value._count / (_count + 0.0) < threshold)
            {
                toBeDeleted.Add(entry.Key);
            }

            if (maxNode == null || entry.Value._count > maxNode._count)
            {
                maxElement = entry.Key;
                maxNode = entry.Value;
            }
        }

        foreach (var symbol in toBeDeleted)
        {
            _children.Remove(symbol);
        }

        // Keep the most frequent child when everything fell below the threshold.
        // maxNode is null only when there were no children to begin with.
        if (_children.Count == 0 && maxNode != null)
        {
            _children[maxElement] = maxNode;
        }
    }
    else
    {
        foreach (var node in _children.Values)
        {
            node.Prune(threshold, N - 1);
        }
    }
}