示例#1
0
        /// <summary>
        /// Returns the industry lemma rank table.
        /// </summary>
        /// <returns>
        /// Always <c>null</c> in the current implementation: the body is a placeholder
        /// that never builds or looks up a table.
        /// </returns>
        public industryLemmaRankTable GetTable()
        {
            // Placeholder: nothing is constructed yet, so callers must expect null.
            return null;
        }
示例#2
0
        /// <summary>
        /// Fills <paramref name="output"/> with primary and secondary industry lemma terms
        /// derived from the chunks in <paramref name="chunkTable"/>.
        /// </summary>
        /// <remarks>
        /// The method works in two passes:
        /// 1. Every chunk whose <c>documentSetFrequency</c> is greater than 1 is split into lemmas
        ///    (using <c>textMapBase.SEPARATOR</c>); lemmas of two characters or fewer are dropped.
        ///    The remaining lemmas are counted and, for each lemma, the other lemmas of the same
        ///    chunk are recorded as its companions.
        /// 2. Every lemma counted more than once becomes a primary term, weighted by
        ///    <c>settings.PrimaryTermFactor</c>. Its companions become secondary terms, weighted by
        ///    <c>settings.SecondaryTermFactor</c>, but only if they have no term type yet.
        /// The primary assignment is unconditional, so a term first registered as secondary is
        /// promoted if it later qualifies as primary; a primary term is never demoted.
        /// </remarks>
        /// <param name="chunkTable">Table of chunks; its <c>GetList()</c> result is the input of the first pass.</param>
        /// <param name="termTable">Table indexed by term name to obtain the base weight of each lemma.</param>
        /// <param name="output">Rank table that is updated in place via <c>GetOrCreate</c> / <c>AddOrUpdate</c>.</param>
        /// <returns>The same <paramref name="output"/> instance that was passed in.</returns>
        public industryLemmaRankTable process(webLemmaTermTable chunkTable, webLemmaTermTable termTable, industryLemmaRankTable output)
        {
            List<webLemmaTerm> allChunks = chunkTable.GetList();

            // Only chunks with documentSetFrequency above 1 take part in the analysis.
            var docSetFreq = allChunks.Where(x => x.documentSetFrequency > 1);

            // Occurrence counter for every lemma seen in the selected chunks.
            instanceCountCollection<String> termCounter = new instanceCountCollection<string>();

            // lemma -> other lemmas that shared a chunk with it.
            aceDictionarySet<String, String> dict = new aceDictionarySet<string, string>();

            foreach (webLemmaTerm chunk in docSetFreq)
            {
                var lemmas = chunk.nominalForm.SplitSmart(textMapBase.SEPARATOR, "", true, true);
                lemmas = lemmas.Where(x => x.Length > 2).ToList();
                termCounter.AddInstanceRange(lemmas);

                // Register every ordered pair of distinct lemmas from this chunk.
                foreach (String lm in lemmas)
                {
                    foreach (String lmi in lemmas)
                    {
                        if (lmi != lm)
                        {
                            dict[lm].AddUnique(lmi);
                        }
                    }
                }
            }

            foreach (var pair in termCounter)
            {
                // Lemmas counted only once are not promoted to primary terms.
                if (termCounter[pair] > 1)
                {
                    industryLemmaTerm lemma = output.GetOrCreate(pair);
                    lemma.termType = industryLemmaTermType.primary;

                    // NOTE(review): assumes termTable has an entry for lemma.name — confirm,
                    // otherwise this dereference may fail.
                    lemma.weight = settings.PrimaryTermFactor * termTable[lemma.name].weight;

                    lemma.nominalForm = pair;
                    output.AddOrUpdate(lemma);

                    // A lemma that only ever appeared alone in its chunks may have no companion entry.
                    if (dict.ContainsKey(lemma.nominalForm))
                    {
                        foreach (String secLemmas in dict[lemma.nominalForm])
                        {
                            industryLemmaTerm lemmaSec = output.GetOrCreate(secLemmas);

                            // Never overwrite a term that already has a type (e.g. an existing primary).
                            if (lemmaSec.termType == industryLemmaTermType.none)
                            {
                                lemmaSec.termType    = industryLemmaTermType.secondary;
                                lemmaSec.weight      = settings.SecondaryTermFactor * termTable[lemmaSec.name].weight;
                                lemmaSec.nominalForm = secLemmas;
                                output.AddOrUpdate(lemmaSec);
                            }
                        }
                    }
                }
            }

            // NOTE: a third "reserve" tier (settings.ReserveTermFactor, industryLemmaTermType.reserve)
            // used to be sketched here as commented-out code. It was disabled and has been removed,
            // together with the "primaries" list that only that code consumed; recover it from
            // version control if the feature is revived.

            return output;
        }