Example #1
 /// <summary>
 /// Perform a search using the given query.
 /// </summary>
 public List<SearchResultItem> executeQuery(Query query, WeightingScheme scheme)
 {
     // Find the documents that contain terms that exist in the query.
     var results = new Dictionary<int,float>();
     foreach (var term in query.terms)
     {
         float idf = scheme.computeIDF((float)documents.getDocuments().Count, (float)term.term.getOccurences().Count);
         foreach (var occurence in term.term.getOccurences())
         {
             float dl = (float)documents.documentById(occurence.documentId).length;
             float termSimilarity = scheme.computeTermSimilarity((float)occurence.frequency, idf, (float)term.frequency, dl, documents.averageDocumentLength);
             float similarity;
             if (results.TryGetValue(occurence.documentId, out similarity))
             {
                 results[occurence.documentId] = similarity + termSimilarity;
             }
             else
             {
                 results[occurence.documentId] = termSimilarity;
             }
         }
     }
     // Create an ordered list of resulting documents and their similarity scores.
     var resultList = new List<SearchResultItem>();
     foreach (var i in results)
     {
         resultList.Add(new SearchResultItem(i.Key, i.Value));
     }
     // Sort in descending order of similarity.
     resultList.Sort((x, y) => y.similarity.CompareTo(x.similarity));
     return resultList;
 }
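A minimal usage sketch for this example follows. The `engine` variable, the `Query(string)` constructor, the parameterless `WeightingScheme` constructor, and the `documentId` property name on `SearchResultItem` are assumptions for illustration only; they are not shown in the code above.

 // Hypothetical usage sketch: `engine` hosts executeQuery; the Query(string)
 // constructor and the documentId property are assumed, not taken from the example.
 var query = new Query("vector space model");
 var scheme = new WeightingScheme();   // assuming a concrete scheme with a parameterless constructor
 List<SearchResultItem> hits = engine.executeQuery(query, scheme);
 foreach (var hit in hits)
 {
     Console.WriteLine($"doc {hit.documentId}: {hit.similarity}");
 }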
Example #2
        public Vec SingleSentence2Vec(string sentence, WeightingScheme weightingScheme = WeightingScheme.AVG)
        {
            Vec        sentenceVector     = new Vec();
            List <Vec> sentenceVectorList = new List <Vec>();

            string[] words = sentence.Split(' ');
            foreach (string word in words)
            {
                Vec vec = Word2Vec(word.ToLower());
                sentenceVectorList.Add(vec);
            }
            if (weightingScheme == WeightingScheme.AVG)
            {
                int    dim = sentenceVectorList[0].VecNodes.Count;
                double nodeTotalValue;
                for (int k = 0; k < dim; k++)
                {
                    nodeTotalValue = 0;
                    for (int j = 0; j < sentenceVectorList.Count; j++)
                    {
                        Vec    curWordVec = sentenceVectorList[j];
                        double curNodeVal = curWordVec.VecNodes[k];
                        nodeTotalValue += curNodeVal;
                    }
                    // Average over the number of word vectors in the sentence.
                    sentenceVector.VecNodes.Add(nodeTotalValue / sentenceVectorList.Count);
                }
            }
            return(sentenceVector);
        }
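A brief usage sketch, assuming the method above lives on a class (called SentenceEmbedder here purely for illustration) that also provides Word2Vec and the Vec type with its VecNodes list:

        // Hypothetical usage sketch: SentenceEmbedder is an assumed host class name.
        var embedder = new SentenceEmbedder();
        Vec sentenceVec = embedder.SingleSentence2Vec("the quick brown fox");   // defaults to WeightingScheme.AVG
        Console.WriteLine($"dimensions: {sentenceVec.VecNodes.Count}");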
Example #3
        public List <Vec> Sentence2Vec(List <string> sentences, WeightingScheme weightingScheme = WeightingScheme.AVG)
        {
            // Implementing TF-IDF
            // TFIDFGenerator tfidfGenerator = new TFIDFGenerator();
            List <List <double> > weights = null;// tfidfGenerator.TFIDFWeightVectorsForSentences(sentences.ToArray());

            List <List <Vec> > matixList          = new List <List <Vec> >();
            List <Vec>         sentenceVectorList = new List <Vec>();

            sentences.ForEach(sentence => {
                //List<Vec> sentenceVectorList = new List<Vec>();
                //string[] words = sentence.Split(' ');
                //foreach (string word in words)
                //{
                //    Vec vec = Word2Vec(word.ToLower());
                //    sentenceVectorList.Add(vec);
                //}
                //matixList.Add(sentenceVectorList);
            });

            // With the body commented out above, this stub currently returns an empty list.
            return(sentenceVectorList);

            /*
             * List<Vec> vectorList = new List<Vec>();
             * // Traverse each sentence
             * for (int i = 0; i < sentences.Count; i++)
             * {
             *  Vec sentenceVector = null;
             *  List<Vec> curVecList = matixList[i];
             *  if (weightingScheme == WeightingScheme.TFIDF)
             *  {
             *      // Get this sentence
             *      List<double> weight = weights[i];
             *      sentenceVector = TFIDFMultiply(curVecList, weight);
             *  }
             *  if (weightingScheme == WeightingScheme.AVG)
             *  {
             *      int dim = curVecList[0].VecNodes.Count;
             *      sentenceVector = new Vec();
             *      double nodeTotalValue;
             *      for (int k = 0; k < dim; k++)
             *      {
             *          nodeTotalValue = 0;
             *          for (int j = 0; j < curVecList.Count; j++)
             *          {
             *              Vec curWordVec = curVecList[j];
             *              double curNodeVal = curWordVec.VecNodes[k];
             *              nodeTotalValue += curNodeVal;
             *          }
             *          sentenceVector.VecNodes.Add(nodeTotalValue / dim);
             *
             *      }
             *
             *  }
             *  vectorList.Add(sentenceVector);
             * }
             * for (int i = 0; i < vectorList.Count; i++)
             * {
             *  if (this.dict.ContainsKey(sentences[i]))
             *  {
             *      continue;
             *  }
             *  else
             *  {
             *      this.dict.Add(sentences[i], vectorList[i]);
             *  }
             *
             * }
             * return vectorList;
             */
        }
Example #4
        public List <Vec> Sentence2Vec(List <string> sentences, WeightingScheme weightingScheme = WeightingScheme.AVG)
        {
            // Implementing TF-IDF: the generator is currently disabled, so `weights`
            // stays null and the TFIDF branch below would throw if selected.
            // TFIDFGenerator tfidfGenerator = new TFIDFGenerator();
            List <List <double> > weights = null;// tfidfGenerator.TFIDFWeightVectorsForSentences(sentences.ToArray());

            List <List <Vec> > matixList = new List <List <Vec> >();

            sentences.ForEach(sentence => {
                List <Vec> sentenceVectorList = new List <Vec>();
                string[] words = sentence.Split(' ');
                foreach (string word in words)
                {
                    Vec vec = Word2Vec(word.ToLower());
                    sentenceVectorList.Add(vec);
                }
                matixList.Add(sentenceVectorList);
            });

            List <Vec> vectorList = new List <Vec>();

            // Traverse each sentence
            for (int i = 0; i < sentences.Count; i++)
            {
                Vec        sentenceVector = null;
                List <Vec> curVecList     = matixList[i];
                if (weightingScheme == WeightingScheme.TFIDF)
                {
                    // Get this sentence
                    List <double> weight = weights[i];
                    sentenceVector = TFIDFMultiply(curVecList, weight);
                }
                if (weightingScheme == WeightingScheme.AVG)
                {
                    int dim = curVecList[0].VecNodes.Count;
                    sentenceVector = new Vec();
                    double nodeTotalValue;
                    for (int k = 0; k < dim; k++)
                    {
                        nodeTotalValue = 0;
                        for (int j = 0; j < curVecList.Count; j++)
                        {
                            Vec    curWordVec = curVecList[j];
                            double curNodeVal = curWordVec.VecNodes[k];
                            nodeTotalValue += curNodeVal;
                        }
                        // Average over the number of word vectors in the sentence.
                        sentenceVector.VecNodes.Add(nodeTotalValue / curVecList.Count);
                    }
                }
                vectorList.Add(sentenceVector);
            }
            // Cache each sentence vector unless the sentence is already in the dictionary.
            for (int i = 0; i < vectorList.Count; i++)
            {
                if (!this.dict.ContainsKey(sentences[i]))
                {
                    this.dict.Add(sentences[i], vectorList[i]);
                }
            }
            return(vectorList);
        }
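A short usage sketch for the batch method, under the same assumptions as above (the embedder instance and its class name are hypothetical); only AVG weighting is shown, since TFIDF would require the commented-out TFIDFGenerator:

        // Hypothetical usage sketch: `embedder` is the same assumed host instance as above.
        var sentences = new List<string> { "first example sentence", "second example sentence" };
        List<Vec> sentenceVectors = embedder.Sentence2Vec(sentences, WeightingScheme.AVG);
        Console.WriteLine($"embedded {sentenceVectors.Count} sentences");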