Пример #1
0
        /**
         * <summary>Reads a node, and recursively all of its descendants, from the given input.
         * Unless the node is the root, the first line read holds its symbol. The next line holds four
         * space-separated fields: count, probability, probability of unseen and number of children.
         * That many child nodes are then read from the same input.</summary>
         *
         * <param name="isRootNode">true if this node is the root, in which case no symbol line is read.</param>
         * <param name="multipleFile">input from which the lines are read.</param>
         */
        public NGramNode(bool isRootNode, MultipleFile multipleFile)
        {
            if (!isRootNode)
            {
                var symbolText = multipleFile.ReadLine().Trim();
                _symbol = (TSymbol)Convert.ChangeType(symbolText, typeof(TSymbol));
            }

            var fields = multipleFile.ReadLine().Split(" ");

            _count = int.Parse(fields[0]);
            _probability = double.Parse(fields[1]);
            _probabilityOfUnseen = double.Parse(fields[2]);

            var remaining = int.Parse(fields[3]);
            if (remaining <= 0)
            {
                // Leaf node: the children dictionary is left unset.
                return;
            }

            _children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
            while (remaining-- > 0)
            {
                var node = new NGramNode<TSymbol>(false, multipleFile);
                _children.Add(node._symbol, node);
            }
        }
Пример #2
0
 /**
  * <summary>Creates an empty N-gram model of size n: the vocabulary starts empty, the
  * unseen-probability array has n entries, and the root node is created with the default symbol.</summary>
  *
  * <param name="n">size of ngram.</param>
  */
 public NGram(int n)
 {
     _n = n;
     _vocabulary = new HashSet<TSymbol>();
     _probabilityOfUnseen = new double[n];

     var root = new NGramNode<TSymbol>(default(TSymbol));
     rootNode = root;
 }
Пример #3
0
        /**
         * <summary>Adds count times NGram given as array of symbols to the node as a child.
         * The child for the symbol at index is created if it does not exist yet, its count is
         * increased by count, and the remaining symbols are added below it recursively.</summary>
         *
         * <param name="sentence">array of symbols.</param>
         * <param name="index">start index of NGram.</param>
         * <param name="height">height for NGram. If height = 1, N-Gram is treated as UniGram, if height = 2,
         * N-Gram is treated as Bigram, etc. A height of 0 adds nothing.</param>
         * <param name="count">number of times this NGram is added.</param>
         */
        public void AddNGram(TSymbol[] sentence, int index, int height, int count = 1)
        {
            if (height == 0)
            {
                return;
            }

            var s = sentence[index];

            // One TryGetValue lookup replaces a ContainsKey check followed by the indexer.
            NGramNode<TSymbol> child = null;
            if (_children == null || !_children.TryGetValue(s, out child))
            {
                child = new NGramNode<TSymbol>(s);
                if (_children == null)
                {
                    // The children dictionary is created lazily, on the first child.
                    _children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
                }

                _children.Add(s, child);
            }

            child._count += count;
            child.AddNGram(sentence, index + 1, height - 1, count);
        }
Пример #4
0
        /**
         * <summary>Constructor of NGram class which reads the model from a list of files.
         * The header is read first, then the root node and its descendants.</summary>
         *
         * <param name="fileNameList">names of the files the NGram is read from.</param>
         */
        public NGram(params string[] fileNameList)
        {
            var multipleFile = new MultipleFile(fileNameList);

            try
            {
                ReadHeader(multipleFile.GetStreamReader());
                rootNode = new NGramNode<TSymbol>(true, multipleFile);
            }
            finally
            {
                // Close the input even when reading the header or a node throws.
                multipleFile.Close();
            }
        }
Пример #5
0
        /**
         * <summary>Constructor of NGram class which takes filename to read from text file.
         * The header is read first, then the root node and its descendants.</summary>
         *
         * <param name="fileName">name of the text file where NGram is saved.</param>
         */
        public NGram(string fileName)
        {
            // The using statement disposes the reader even when reading the header or a node throws.
            using (var br = new StreamReader(fileName))
            {
                ReadHeader(br);
                rootNode = new NGramNode<TSymbol>(true, br);
            }
        }
Пример #6
0
        /**
         * <summary>Constructor of NGram class which takes a corpus and the size of ngram as input.
         * It adds all sentences of corpus as ngrams, in corpus order.</summary>
         *
         * <param name="corpus">list of sentences whose ngrams are added.</param>
         * <param name="n">size of ngram.</param>
         */
        public NGram(List<List<TSymbol>> corpus, int n)
        {
            _n = n;
            _vocabulary = new HashSet<TSymbol>();
            _probabilityOfUnseen = new double[n];
            rootNode = new NGramNode<TSymbol>(default(TSymbol));

            foreach (var sentence in corpus)
            {
                var symbols = sentence.ToArray();
                AddNGramSentence(symbols);
            }
        }
Пример #7
0
        /**
         * <summary>Replace words not in given dictionary.
         * Deletes unknown words from children nodes and adds their children to the unknown node,
         * whose count becomes the sum of the removed children's counts. The replacement is then
         * applied recursively to the unknown node and to every remaining child.</summary>
         *
         * <param name="dictionary">dictionary of known words.</param>
         */
        public void ReplaceUnknownWords(HashSet<TSymbol> dictionary)
        {
            if (_children == null)
            {
                return;
            }

            // Collect first: the children dictionary must not be modified while it is enumerated.
            var childList = new List<NGramNode<TSymbol>>();
            foreach (var entry in _children)
            {
                if (!dictionary.Contains(entry.Key))
                {
                    childList.Add(entry.Value);
                }
            }

            if (childList.Count > 0)
            {
                _unknown = new NGramNode<TSymbol>(default(TSymbol));
                _unknown._children = new Dictionary<TSymbol, NGramNode<TSymbol>>();
                var sum = 0;
                foreach (var unknownChild in childList)
                {
                    if (unknownChild._children != null)
                    {
                        foreach (var grandChild in unknownChild._children)
                        {
                            // Indexer instead of Add: two unknown words can be followed by the same
                            // symbol, and Add throws on a duplicate key. The later subtree replaces
                            // the earlier one.
                            _unknown._children[grandChild.Key] = grandChild.Value;
                        }
                    }

                    sum += unknownChild._count;
                    _children.Remove(unknownChild._symbol);
                }

                _unknown._count = sum;
                _unknown.ReplaceUnknownWords(dictionary);
            }

            foreach (var knownChild in _children.Values)
            {
                knownChild.ReplaceUnknownWords(dictionary);
            }
        }
Пример #8
0
 /**
  * <summary>Merges the given node into this node. Children present in both nodes are merged
  * recursively, children present only in toBeMerged are attached to this node, and the count of
  * toBeMerged is added to this node's count.</summary>
  *
  * <param name="toBeMerged">node whose counts and children are merged into this node.</param>
  */
 public void Merge(NGramNode<TSymbol> toBeMerged)
 {
     // A toBeMerged node without children has nothing to merge below this level.
     if (_children != null && toBeMerged._children != null)
     {
         foreach (var entry in toBeMerged._children)
         {
             NGramNode<TSymbol> existing;
             if (_children.TryGetValue(entry.Key, out existing))
             {
                 existing.Merge(entry.Value);
             }
             else
             {
                 _children[entry.Key] = entry.Value;
             }
         }
     }

     // NOTE(review): when this node has no children, the children of toBeMerged are not
     // adopted; only the count is added. Confirm this is intended.
     _count += toBeMerged.GetCount();
 }
Пример #9
0
        /**
         * <summary>Prunes the children found N levels below this node. At that level, every child whose
         * count divided by its parent's count is below threshold is removed. If that would remove
         * all children, the child with the highest count is kept. A node without children is left
         * unchanged.</summary>
         *
         * <param name="threshold">minimum ratio of child count to parent count for a child to be kept.</param>
         * <param name="N">number of levels to descend before pruning; 0 prunes this node's children.</param>
         */
        public void Prune(double threshold, int N)
        {
            if (_children == null)
            {
                // Nothing to prune or descend into.
                return;
            }

            if (N != 0)
            {
                foreach (var node in _children.Values)
                {
                    node.Prune(threshold, N - 1);
                }

                return;
            }

            TSymbol maxElement = default(TSymbol);
            NGramNode<TSymbol> maxNode = null;
            var toBeDeleted = new List<TSymbol>();
            foreach (var entry in _children)
            {
                var child = entry.Value;
                if (child._count / (_count + 0.0) < threshold)
                {
                    toBeDeleted.Add(entry.Key);
                }

                // Strict comparison: on ties the first child seen stays the maximum.
                if (maxNode == null || child._count > maxNode._count)
                {
                    maxElement = entry.Key;
                    maxNode = child;
                }
            }

            foreach (var symbol in toBeDeleted)
            {
                _children.Remove(symbol);
            }

            // Restore the most frequent child when everything was pruned. maxNode is null only
            // when there were no children to begin with, in which case nothing is restored.
            if (_children.Count == 0 && maxNode != null)
            {
                _children[maxElement] = maxNode;
            }
        }