/// <summary>
 /// Calculates a weighted mix of word costs (生起コスト) and path costs (遷移コスト) over a series of nodes.
 /// </summary>
 /// <typeparam name="TNode">The type of nodes.</typeparam>
 /// <param name="bundle">The dictionary bundle to get cost parameters from.</param>
 /// <param name="weight">A factor between 0.0f and 1.0f to determine the weights of word and path costs.</param>
 /// <param name="array">An array of nodes.</param>
 /// <returns>The sum of a mix of word costs and path costs.</returns>
 /// <remarks>
 /// Use <paramref name="weight"/> to control how much of word and path costs contribute to the cost.
 /// If it is 0.0f, the resulting cost equals to the word cost (<see cref="WordsCost{TNode}"/>).
 /// If it is 1.0f, the resulting cost equals to the path cost (<see cref="PathsCost{TNode}"/>).
 /// If it is in between, the resulting cost is a weighted mix of word and path costs.
 /// In particular, <paramref name="weight"/> of 0.5f gives exactly a half of the total cost
 /// (<see cref="TotalCost{TNode}"/>).
 /// <paramref name="weight"/> can be below 0 or above 1, though the returned value may be meaningless.
 /// </remarks>
 public static long MixedCost <TNode>(this DictionaryBundle <TNode> bundle, double weight, params TNode[] array)
     where TNode : MeCabNodeBase <TNode>
 {
     // Note that f * P + (1 - f) * W == f * (P + W) - (f * 2 - 1) * W,
     // and the right side is easier to calculate in this case.
     return((long)Math.Round(weight * TotalCost(bundle, array) - (weight * 2 - 1) * WordsCost(bundle, array)));
 }
Example #2
0
        /// <summary>
        /// Gets all tokens from a dictionary as isolated nodes.
        /// </summary>
        /// <typeparam name="TNode">The type of nodes that <paramref name="bundle"/> is for.</typeparam>
        /// <param name="bundle">The dictionary bundle containing <paramref name="dictionary"/>.</param>
        /// <param name="dictionary">The dictionary to get nodes from.</param>
        /// <returns>Iteration of nodes.</returns>
        /// <remarks>
        /// Iterating over all nodes may require some significant time, depending on the size of the dictionary.  Please be careful.
        /// </remarks>
        public static IEnumerable <TNode> GetNodes <TNode>(this DictionaryBundle <TNode> bundle, MeCabDictionary dictionary) where TNode : MeCabNodeBase <TNode>
        {
            var   address = GetSafeMemoryMappedViewAddress(dictionary);
            ulong token_table_starts;
            ulong token_table_ends;

            GetTokenTableLocations(address, out token_table_starts, out token_table_ends);
            for (ulong t = token_table_starts; t < token_table_ends; t += 16)
            {
                var node = bundle.NodeAllocator();
                LoadNodeData(t, node);
                node.Feature = GetFeature(t, bundle, dictionary);
                node.Stat    = MeCabNodeStat.Nor;
                yield return(node);
            }
        }
        /// <summary>
        /// Calculates the total cost of a series of nodes.
        /// </summary>
        /// <typeparam name="TNode">The type of nodes.</typeparam>
        /// <param name="bundle">The dictionary bundle to get cost parameters from.</param>
        /// <param name="array">An array of nodes.</param>
        /// <returns>The total cost of the array of nodes, i.e., the sum of word costs (生起コスト) and path costs (遷移コスト).</returns>
        /// <remarks>
        /// <para>
        /// This method uses nodes' <see cref="MeCabNodeSuperBase.WCost"/> but
        /// doesn't use <see cref="MeCabNodeBase{TNode}.LPath"/>, <see cref="MeCabNodeBase{TNode}.RPath"/>, or their <see cref="MeCabPath{TNode}.Cost"/>,
        /// so <paramref name="array"/> doesn't need to be from a parsed lattice.
        /// Path costs are looked anew up in <paramref name="bundle"/>
        /// (in particular in <see cref="DictionaryBundle{TNode}.Connector"/>).
        /// </para>
        /// <para>
        /// This method doesn't count the path costs from BOS to the first node and from the last node to EOS,
        /// so that it works better if applied to a sequence of nodes in a middle of a sentence.
        /// </para>
        /// </remarks>
        public static long TotalCost <TNode>(this DictionaryBundle <TNode> bundle, params TNode[] array)
            where TNode : MeCabNodeBase <TNode>
        {
            if (array.Length == 0)
            {
                return(0);
            }
            var   connector = bundle.Connector;
            TNode node, prev;

            node = prev = array[0];
            long cost = node.WCost;

            for (int i = 1; i < array.Length; i++)
            {
                node  = array[i];
                cost += connector.Cost(prev, node);
                prev  = node;
            }
            return(cost);
        }
Example #4
0
        /// <summary>
        /// Gets the header information of a MeCab dic file.
        /// </summary>
        /// <typeparam name="TNode">The type of nodes that <paramref name="bundle"/> is for.</typeparam>
        /// <param name="bundle">The dictionary bundle containing <paramref name="dictionary"/>.</param>
        /// <param name="dictionary">A dictionary object to get the header from.</param>
        /// <returns>A header.</returns>
        public unsafe static Header GetHeader <TNode>(this DictionaryBundle <TNode> bundle, MeCabDictionary dictionary) where TNode : MeCabNodeBase <TNode>
        {
            var h       = (byte *)GetSafeMemoryMappedViewAddress(dictionary);
            var charset = new byte[32];

            for (int i = 0; i < charset.Length; i++)
            {
                charset[i] = h[40 + i];
            }
            return(new Header()
            {
                Magic = *(uint *)(h + 0),
                Version = *(uint *)(h + 4),
                Type = *(uint *)(h + 8),
                LexSize = *(uint *)(h + 12),
                LSize = *(uint *)(h + 16),
                RSize = *(uint *)(h + 20),
                DSize = *(uint *)(h + 24),
                TSize = *(uint *)(h + 28),
                FSize = *(uint *)(h + 32),
                Dummy = *(uint *)(h + 36),
                Charset = charset,
            });
        }
 /// <summary>
 /// Calculates the sum of path costs (遷移コスト) between nodes.
 /// </summary>
 /// <typeparam name="TNode">The type of nodes.</typeparam>
 /// <param name="bundle">The dictionary bundle to get cost parameters from.</param>
 /// <param name="array">An array of nodes.</param>
 /// <returns>The sum of path costs.</returns>
 /// <remarks>
 /// This method doesn't use any cost information stored in or linked from nodes.
 /// Path costs are looked anew up in <paramref name="bundle"/>
 /// (in particular in <see cref="DictionaryBundle{TNode}.Connector"/>).
 /// </remarks>
 public static long PathsCost <TNode>(this DictionaryBundle <TNode> bundle, params TNode[] array)
     where TNode : MeCabNodeBase <TNode>
 {
     return(TotalCost(bundle, array) - WordsCost(bundle, array));
 }
 /// <summary>
 /// Calculates the sum of word costs (生起コスト) of nodes.
 /// </summary>
 /// <typeparam name="TNode">The type of nodes.</typeparam>
 /// <param name="bundle">Not used for calculation.</param>
 /// <param name="array">An array of nodes.</param>
 /// <returns>The sum of word costs.</returns>
 /// <remarks>
 /// This method uses nodes' <see cref="MeCabNodeSuperBase.WCost"/> to calculate the sum.
 /// <paramref name="bundle"/> parameter is actually not needed.
 /// </remarks>
 public static long WordsCost <TNode>(this DictionaryBundle <TNode> bundle, params TNode[] array)
     where TNode : MeCabNodeBase <TNode>
 {
     return(array.Sum(n => (long)n.WCost));
 }
 /// <summary>
 /// Calculate the mixed cost increase when a node would be added at the end of an existing open sequence.
 /// </summary>
 /// <typeparam name="TNode">Type of nodes.</typeparam>
 /// <param name="bundle">A bundle of dictionaries to get cost parameters form.</param>
 /// <param name="weight">A factor to determine the weights of word and path costs.</param>
 /// <param name="prev">The last node in the existing open sequence.</param>
 /// <param name="next">A node that would be added.</param>
 /// <returns>The mixed cost increase in <see cref="long"/> value.</returns>
 /// <remarks>
 /// This is a convenient method around <see cref="MixedCostIncrease{TNode}(DictionaryBundle{TNode}, double, TNode, TNode)"/>
 /// to return an integral value.
 /// Because a mixed cost is actually non-integral,
 /// The rounded value returned from this method may have some errors.
 /// </remarks>
 public static long MixedCostIncreaseRounded <TNode>(this DictionaryBundle <TNode> bundle, double weight, TNode prev, TNode next)
     where TNode : MeCabNodeBase <TNode>
 {
     return((long)Math.Round(MixedCostIncrease(bundle, weight, prev, next)));
 }
 /// <summary>
 /// Calculate the mixed cost increase when a node whould be added at the end of an existing open sequence.
 /// </summary>
 /// <typeparam name="TNode">Type of nodes.</typeparam>
 /// <param name="bundle">A bundle of dictionaries to get cost parameters from.</param>
 /// <param name="weight">A factor to determine the weights of word and path costs.</param>
 /// <param name="prev">The last node in the existing open sequence.</param>
 /// <param name="next">A node that would be added.</param>
 /// <returns>The mixed cost increase in <see cref="double"/> value.</returns>
 /// <remarks>
 /// Although <see cref="MixedCost{TNode}(DictionaryBundle{TNode}, double, TNode[])"/> returns a long value,
 /// the accurate mixed cost is not an integral value, because <paramref name="weight"/> can have any fractions.
 /// This methods returns the estimated increase in double to give more accurate estimation.
 /// </remarks>
 /// <seealso cref="MixedCostIncreaseRounded{TNode}(DictionaryBundle{TNode}, double, TNode, TNode)"/>
 public static double MixedCostIncrease <TNode>(this DictionaryBundle <TNode> bundle, double weight, TNode prev, TNode next)
     where TNode : MeCabNodeBase <TNode>
 {
     return(weight * bundle.Connector.Cost(prev, next) - (weight * 2 - 1) * next.WCost);
 }
 /// <summary>
 /// Calculates the total cost increase when a node would be added at the end of an existing open sequence.
 /// </summary>
 /// <typeparam name="TNode">Type of nodes.</typeparam>
 /// <param name="bundle">A bundle of dictionaries to get cost parameters from.</param>
 /// <param name="prev">The last node in the existing open sequence.</param>
 /// <param name="next">A node that would be added.</param>
 /// <returns>The total cost increase.</returns>
 public static long TotalCostIncrease <TNode>(this DictionaryBundle <TNode> bundle, TNode prev, TNode next)
     where TNode : MeCabNodeBase <TNode>
 {
     return(bundle.Connector.Cost(prev, next));
 }
Example #10
0
 /// <summary>
 /// Gets all tokens defined in all dictionaries as isolated nodes.
 /// </summary>
 /// <typeparam name="TNode">The type of nodes that <paramref name="bundle"/> is for.</typeparam>
 /// <param name="bundle">The dictionary bundle.</param>
 /// <returns>Iteration of nodes.</returns>
 /// <remarks>
 /// Iterating over all nodes usually requires some significant time.  Please be careful.
 /// </remarks>
 public static IEnumerable <TNode> GetAllNodes <TNode>(this DictionaryBundle <TNode> bundle) where TNode : MeCabNodeBase <TNode>
 {
     return(bundle.Dictionaries.SelectMany(dictionary => GetNodes(bundle, dictionary)));
 }
Example #11
0
        private unsafe static string GetFeature <TNode>(ulong address, DictionaryBundle <TNode> bundle, MeCabDictionary dic) where TNode : MeCabNodeBase <TNode>
        {
            var h = (byte *)address;

            return(StrUtils.GetString(dic.GetFeature(*(uint *)(h + 8)), bundle.Tokenizer.Encoding));
        }