Example #1
0
        /// <summary>
        /// Builds one document vector per requested field from the "q" query-string value.
        /// </summary>
        /// <param name="collectionName">Collection name; its hash is combined with each field hash to resolve a key id.</param>
        /// <param name="request">Request whose query string supplies "q" and, optionally, "fields".</param>
        /// <param name="readSession">Session used to create an index reader for each key id.</param>
        /// <param name="sessionFactory">Factory used to resolve key ids.</param>
        /// <returns>The created vectors, keyed by key id.</returns>
        public IDictionary <long, SortedList <long, byte> > Parse(
            string collectionName, HttpRequest request, ReadSession readSession, SessionFactory sessionFactory)
        {
            var docs = new Dictionary <long, SortedList <long, byte> >();

            // Fall back to "title" and "body" when the query string carries no "fields" entry.
            var fields = request.Query.ContainsKey("fields")
                ? request.Query["fields"].ToArray()
                : new[] { "title", "body" };

            var phrase = request.Query["q"];

            foreach (var field in fields)
            {
                var keyId = sessionFactory.GetKeyId(collectionName.ToHash(), field.ToLower().ToHash());

                // The same field may be requested more than once (or with different casing,
                // since names are lower-cased before hashing). Dictionary.Add would throw on
                // the repeated key id, so the first vector is kept and repeats are skipped.
                if (docs.ContainsKey(keyId))
                {
                    continue;
                }

                var vector = BOWWriteSession.CreateDocumentVector(phrase, readSession.CreateIndexReader(keyId), _tokenizer);

                docs.Add(keyId, vector);
            }

            return docs;
        }
Example #2
0
        /// <summary>
        /// Tokenizes the field values of the given documents and enqueues every
        /// non-blank token string on <c>_httpQueue</c>.
        /// </summary>
        /// <param name="documents">Documents whose field values are tokenized.</param>
        /// <param name="excludeKeyIds">Key ids of fields that are skipped.</param>
        public void Warmup(IEnumerable <IDictionary> documents, params long[] excludeKeyIds)
        {
            foreach (var doc in documents)
            {
                foreach (var key in doc.Keys)
                {
                    var strKey = key.ToString();

                    // Fields whose name starts with "__" are not tokenized.
                    if (strKey.StartsWith("__"))
                    {
                        continue;
                    }

                    var keyId = SessionFactory.GetKeyId(CollectionId, strKey.ToHash());

                    if (excludeKeyIds.Contains(keyId))
                    {
                        continue;
                    }

                    var value = doc[key];

                    // A field without a value has nothing to tokenize; calling ToString()
                    // on it would throw a NullReferenceException.
                    if (value == null)
                    {
                        continue;
                    }

                    var terms = _tokenizer.Tokenize(value.ToString());

                    // Each token is an (offset, length) slice of the original text.
                    var tokens = terms.Tokens
                                 .Select(t => terms.Original.Substring(t.offset, t.length))
                                 .Where(s => !string.IsNullOrWhiteSpace(s));

                    foreach (var token in tokens)
                    {
                        _httpQueue.Enqueue(token);
                    }
                }
            }
        }
Example #3
0
        /// <summary>
        /// Tokenizes the field values of the given documents and enqueues the string
        /// form of every resulting embedding on <c>_httpQueue</c>.
        /// </summary>
        /// <param name="documents">Documents whose field values are tokenized.</param>
        /// <param name="excludeKeyIds">Key ids of fields that are skipped.</param>
        public void Warmup(IEnumerable <IDictionary> documents, params long[] excludeKeyIds)
        {
            foreach (var doc in documents)
            {
                foreach (var key in doc.Keys)
                {
                    var strKey = key.ToString();

                    // Fields whose name starts with "__" are not tokenized.
                    if (strKey.StartsWith("__"))
                    {
                        continue;
                    }

                    var keyId = SessionFactory.GetKeyId(CollectionId, strKey.ToHash());

                    if (excludeKeyIds.Contains(keyId))
                    {
                        continue;
                    }

                    var value = doc[key];

                    // A field without a value has nothing to tokenize; calling ToString()
                    // on it would throw a NullReferenceException.
                    if (value == null)
                    {
                        continue;
                    }

                    var terms = _tokenizer.Tokenize(value.ToString());

                    foreach (var token in terms.Embeddings
                             .Select(t => t.ToString()))
                    {
                        _httpQueue.Enqueue(token);
                    }
                }
            }
        }
Example #4
0
        /// <summary>
        /// Collects the distinct terms of one document into per-key columns.
        /// String values are tokenized; other values, and values of keys starting
        /// with '_', are added as a single term using their string form.
        /// </summary>
        /// <param name="doc">Document to analyze; must contain a ulong "__docid" entry.</param>
        /// <param name="columns">Term sets keyed by key id; missing entries are created here.</param>
        private void Analyze(IDictionary doc, Dictionary <long, HashSet <string> > columns)
        {
            // Not used below; kept because the unboxing cast throws for a document
            // whose "__docid" is missing or is not a ulong.
            var docId = (ulong)doc["__docid"];

            foreach (var obj in doc.Keys)
            {
                var key = (string)obj;

                // Fields whose name starts with "__" are not analyzed.
                if (key.StartsWith("__"))
                {
                    continue;
                }

                var keyHash = key.ToHash();
                var keyId   = SessionFactory.GetKeyId(keyHash);

                HashSet <string> column;

                if (!columns.TryGetValue(keyId, out column))
                {
                    column = new HashSet <string>();
                    columns.Add(keyId, column);
                }

                var val = (IComparable)doc[obj];

                // A field without a value contributes no terms; calling ToString()
                // on it would throw a NullReferenceException.
                if (val == null)
                {
                    continue;
                }

                var str = val as string;

                // The length check keeps an empty key from throwing on key[0].
                var storeVerbatim = str == null || (key.Length > 0 && key[0] == '_');

                if (storeVerbatim)
                {
                    var v = val.ToString();

                    if (!string.IsNullOrWhiteSpace(v))
                    {
                        column.Add(v);
                    }
                }
                else
                {
                    foreach (var token in _tokenizer.Tokenize(str))
                    {
                        column.Add(token);
                    }
                }
            }
        }
Example #5
0
        /// <summary>
        /// Turns every field of every document in the job into a set of terms and
        /// enqueues one <c>BuildJob</c> per field on <c>_buildQueue</c>.
        /// String values are tokenized; other values, and values of keys starting
        /// with '_', become a single term using their string form.
        /// </summary>
        /// <param name="job">Job whose documents are processed; each document must contain a ulong "__docid" entry.</param>
        /// <remarks>Any exception is logged and then rethrown.</remarks>
        private void Write(IndexJob job)
        {
            try
            {
                var docCount = 0;
                var timer    = Stopwatch.StartNew();

                foreach (var doc in job.Documents)
                {
                    var docId = (ulong)doc["__docid"];

                    // Fields whose name starts with "__" are not indexed.
                    var keys = doc.Keys
                               .Cast <string>()
                               .Where(x => !x.StartsWith("__"));

                    foreach (var key in keys)
                    {
                        var        keyHash = key.ToHash();
                        var        keyId   = SessionFactory.GetKeyId(keyHash);
                        VectorNode ix;

                        // Reuse the index already touched by this session, otherwise
                        // take the one GetIndex returns or start a new one.
                        if (!_dirty.TryGetValue(keyId, out ix))
                        {
                            ix = GetIndex(keyHash) ?? new VectorNode();
                            _dirty.Add(keyId, ix);
                        }

                        var val = (IComparable)doc[key];

                        // A field without a value has no terms to index; calling ToString()
                        // on it would throw and abort the whole job.
                        if (val == null)
                        {
                            continue;
                        }

                        var str    = val as string;
                        var tokens = new HashSet <string>();

                        // The length check keeps an empty key from throwing on key[0].
                        if (str == null || (key.Length > 0 && key[0] == '_'))
                        {
                            tokens.Add(val.ToString());
                        }
                        else
                        {
                            foreach (var token in _tokenizer.Tokenize(str))
                            {
                                tokens.Add(token);
                            }
                        }

                        _buildQueue.Enqueue(new BuildJob(CollectionId, docId, tokens, ix));
                    }

                    // Log progress once per 100 documents.
                    if (++docCount == 100)
                    {
                        _log.Log(string.Format("analyzed doc {0}", docId));
                        docCount = 0;
                    }
                }

                _log.Log(string.Format("executed {0} analyze job in {1}",
                                       job.CollectionId, timer.Elapsed));
            }
            catch (Exception ex)
            {
                _log.Log(ex.ToString());

                throw;
            }
        }