
lucene.net Full-Text Search (Part 2): Encapsulating lucene.net

Author: 明志德道
Published: 2023-10-21 17:46:37

Querying

Code language: c#
   public class LuceneQuery : ILuceneQuery
    {
        #region Identity
        private Logger logger = new Logger(typeof(LuceneQuery));
        #endregion Identity

        #region QueryIndex
        /// <summary>
        /// Gets the commodity data that matches the query string.
        /// </summary>
        /// <param name="queryString"></param>
        /// <returns></returns>
        public List<Commodity> QueryIndex(string queryString)
        {
            IndexSearcher searcher = null;
            try
            {
                List<Commodity> ciList = new List<Commodity>();
                Directory dir = FSDirectory.Open(StaticConstant.IndexPath);
                searcher = new IndexSearcher(dir);
                Analyzer analyzer = new PanGuAnalyzer();

                //-------------------------------------- configure the search conditions here
                QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
                Query query = parser.Parse(queryString);
                Console.WriteLine(query.ToString()); // print the parsed search expression
                TopDocs docs = searcher.Search(query, (Filter)null, 10000);

                foreach (ScoreDoc sd in docs.ScoreDocs)
                {
                    Document doc = searcher.Doc(sd.Doc);
                    ciList.Add(DocumentToCommodityInfo(doc));
                }

                return ciList;
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Dispose();
                }
            }
        }



        /// <summary>
        /// Gets commodity data page by page.
        /// </summary>
        /// <param name="queryString"></param>
        /// <param name="pageIndex">1-based; the first page is 1</param>
        /// <param name="pageSize"></param>
        /// <param name="totalCount"></param>
        /// <param name="priceFilter">price range such as "[10,100]"; square brackets include the bound, curly braces exclude it</param>
        /// <param name="priceOrderBy">price ordering, e.g. "price asc" or "price desc"</param>
        /// <returns></returns>
        public List<Commodity> QueryIndexPage(string queryString, int pageIndex, int pageSize, out int totalCount, string priceFilter, string priceOrderBy)
        {
            totalCount = 0;
            IndexSearcher searcher = null;
            try
            {
                List<Commodity> ciList = new List<Commodity>();
                FSDirectory dir = FSDirectory.Open(StaticConstant.IndexPath);
                searcher = new IndexSearcher(dir);
                Analyzer analyzer = new PanGuAnalyzer();

                //-------------------------------------- configure the search conditions here
                QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
                Query query = parser.Parse(queryString);

                pageIndex = Math.Max(1, pageIndex);// page index starts at 1
                int startIndex = (pageIndex - 1) * pageSize;
                int endIndex = pageIndex * pageSize;

                NumericRangeFilter<float> numPriceFilter = null;
                if (!string.IsNullOrWhiteSpace(priceFilter))
                {
                    bool isContainStart = priceFilter.StartsWith("[");
                    bool isContainEnd = priceFilter.EndsWith("]");
                    string[] floatArray = priceFilter.Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Split(',');
                    float start = 0;
                    float end = 0;
                    if (!float.TryParse(floatArray[0], out start) || !float.TryParse(floatArray[1], out end))
                    {
                        throw new Exception("Wrong priceFilter");
                    }
                    numPriceFilter = NumericRangeFilter.NewFloatRange("price", start, end, isContainStart, isContainEnd);
                }

                Sort sort = new Sort();
                if (!string.IsNullOrWhiteSpace(priceOrderBy))
                {
                    // SortField's third argument is "reverse": true sorts descending, so reverse only when the order is not "asc"
                    SortField sortField = new SortField("price", SortField.FLOAT, !priceOrderBy.EndsWith("asc", StringComparison.CurrentCultureIgnoreCase));
                    sort.SetSort(sortField);
                }

                TopDocs docs = searcher.Search(query, numPriceFilter, 10000, sort);
                //TopDocs docs = searcher.Search(query, null, 10000);
                
                totalCount = docs.TotalHits;
                //PrintScores(docs, startIndex, endIndex, searcher);
                for (int i = startIndex; i < endIndex && i < totalCount; i++)
                {
                    Document doc = searcher.Doc(docs.ScoreDocs[i].Doc);
                    ciList.Add(DocumentToCommodityInfo(doc));
                }

                return ciList;
            }
            finally
            {
                if (searcher != null)
                {
                    searcher.Dispose();
                }
            }
        }

        private void PrintScores(TopDocs docs, int startIndex, int endIndex, IndexSearcher searcher)
        {
            ScoreDoc[] scoreDocs = docs.ScoreDocs;
            for (int i = startIndex; i < endIndex && i < scoreDocs.Length; i++)
            {
                int docId = scoreDocs[i].Doc;
                Document doc = searcher.Doc(docId);
                logger.Info(string.Format("score of {0} is {1}", doc.Get("productid"), scoreDocs[i].Score));
            }
        }

        #endregion QueryIndex

        #region private
        private Commodity DocumentToCommodityInfo(Document doc)
        {
            return new Commodity()
                       {
                           Id = int.Parse(doc.Get("id")),
                           Title = doc.Get("title"),
                           ProductId = long.Parse(doc.Get("productid")),
                           CategoryId = int.Parse(doc.Get("categoryid")),
                           ImageUrl = doc.Get("imageurl"),
                           Price = decimal.Parse(doc.Get("price")),
                           Url = doc.Get("url")
                       };
        }

        #endregion private
    }
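
For reference, here is a minimal usage sketch of the query wrapper above. It is hypothetical driver code, not part of the original project, and assumes the Commodity model and StaticConstant.IndexPath used throughout this series; the price filter "[10,100]" and the "price asc" ordering follow the parsing logic shown in QueryIndexPage.

Code language: c#

using System;
using System.Collections.Generic;

// Hypothetical driver code (not part of the original project).
public class QueryDemo
{
    public static void Main()
    {
        LuceneQuery luceneQuery = new LuceneQuery();

        // Simple query: the keyword is parsed against the "title" field with the PanGu analyzer.
        List<Commodity> all = luceneQuery.QueryIndex("牛奶");

        // Paged query: page 1, 20 items per page, price limited to [10, 100]
        // (inclusive because of the square brackets), sorted by price ascending.
        int totalCount;
        List<Commodity> page = luceneQuery.QueryIndexPage("牛奶", 1, 20, out totalCount, "[10,100]", "price asc");

        Console.WriteLine("total hits: {0}, returned this page: {1}", totalCount, page.Count);
    }
}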

Batch/Single Index Add, Update and Delete

Code language: c#
    /// <summary>
    /// Multi-threading: write to multiple index directories, then merge them.
    /// Latency: handle via an asynchronous queue.
    /// </summary>
    public class LuceneBulid : ILuceneBulid
    {
        #region Identity
        private Logger logger = new Logger(typeof(LuceneBulid));
        #endregion Identity

        #region Batch BuildIndex and index merging
        /// <summary>
        /// Builds indexes in batch (all items must share the same sourceflag, i.e. the same directory).
        /// </summary>
        /// <param name="ciList">items sharing the same sourceflag</param>
        /// <param name="pathSuffix">index directory suffix appended to the root path, e.g. sa\1; empty means the root directory</param>
        /// <param name="isCreate">false (default) for incremental indexing; true deletes the existing index first</param>
        public void BuildIndex(List<Commodity> ciList, string pathSuffix = "", bool isCreate = false)
        {
            IndexWriter writer = null;
            try
            {
                if (ciList == null || ciList.Count == 0)
                {
                    return;
                }

                string rootIndexPath = StaticConstant.IndexPath;
                string indexPath = string.IsNullOrWhiteSpace(pathSuffix) ? rootIndexPath : string.Format("{0}\\{1}", rootIndexPath, pathSuffix);

                DirectoryInfo dirInfo = Directory.CreateDirectory(indexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, new PanGuAnalyzer(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                //writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.SetMaxBufferedDocs(100);// number of docs buffered in memory before a new segment is written (default 10)
                writer.MergeFactor = 100;// how often multiple segments are merged (default 10)
                writer.UseCompoundFile = true;// use compound files to reduce the number of index files

                ciList.ForEach(c => CreateCIIndex(writer, c));
            }
            finally
            {
                if (writer != null)
                {
                    //writer.Optimize(); // no optimize while building; handled when merging
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// Merges sub-directory indexes into the parent (root) directory.
        /// </summary>
        /// <param name="childDirs">names of the sub-directories to merge</param>
        public void MergeIndex(string[] childDirs)
        {
            Console.WriteLine("MergeIndex Start");
            IndexWriter writer = null;
            try
            {
                if (childDirs == null || childDirs.Length == 0) return;
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
                string rootPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);// true: deletes the existing root index
                LuceneIO.Directory[] dirNo = childDirs.Select(dir => LuceneIO.FSDirectory.Open(Directory.CreateDirectory(string.Format("{0}\\{1}", rootPath, dir)))).ToArray();
                writer.MergeFactor = 100;// how often multiple segments are merged (default 10)
                writer.UseCompoundFile = true;// use compound files to reduce the number of index files
                writer.AddIndexesNoOptimize(dirNo);
            }
            finally
            {
                if (writer != null)
                {
                    writer.Optimize();
                    writer.Close();
                }
                Console.WriteLine("MergeIndex End");
            }
        }

        //Field.Store.YES: store the field value (the original, un-analyzed value)
        //Field.Store.NO: do not store; storing is independent of indexing
        //Field.Store.COMPRESS: compressed storage, for long text or binary data, at some performance cost
        //Field.Index.ANALYZED: analyze (tokenize) and index
        //Field.Index.ANALYZED_NO_NORMS: analyze and index, but norms are reduced to a single byte to save space
        //Field.Index.NOT_ANALYZED: index without analyzing
        //Field.Index.NOT_ANALYZED_NO_NORMS: index without analyzing; norms are stored in a single byte
        //A TermVector records the terms of a document (located by Document and Field) and how often they occur in that document
        //Field.TermVector.YES: store this field's term vector for every document
        //Field.TermVector.NO: do not store term vectors
        //Field.TermVector.WITH_POSITIONS: store positions
        //Field.TermVector.WITH_OFFSETS: store offsets
        //Field.TermVector.WITH_POSITIONS_OFFSETS: store positions and offsets
        #endregion Batch BuildIndex and index merging

        #region Single/batch index add, update and delete
        /// <summary>
        /// Adds the index for a single commodity.
        /// </summary>
        /// <param name="ci"></param>
        public void InsertIndex(Commodity ci)
        {
            IndexWriter writer = null;
            try
            {
                if (ci == null) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;// if the directory contains no files, create a new index
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 100;// how often multiple segments are merged (default 10)
                writer.UseCompoundFile = true;// use compound files to reduce the number of index files
                CreateCIIndex(writer, ci);
            }
            catch (Exception ex)
            {
                logger.Error("InsertIndex exception", ex);
                throw;
            }
            finally
            {
                if (writer != null)
                {
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// Adds indexes for a batch of commodities.
        /// </summary>
        /// <param name="ciList"></param>
        public void InsertIndexMuti(List<Commodity> ciList)
        {
            BuildIndex(ciList, "", false);
        }

        /// <summary>
        /// Deletes the indexes for a batch of commodities.
        /// </summary>
        /// <param name="ciList"></param>
        public void DeleteIndexMuti(List<Commodity> ciList)
        {
            IndexReader reader = null;
            try
            {
                if (ciList == null || ciList.Count == 0) return;
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                reader = IndexReader.Open(directory, false);
                foreach (Commodity ci in ciList)
                {
                    reader.DeleteDocuments(new Term("productid", ci.ProductId.ToString()));
                }
            }
            catch (Exception ex)
            {
                logger.Error("DeleteIndexMuti exception", ex);
                throw;
            }
            finally
            {
                if (reader != null)
                {
                    reader.Dispose();
                }
            }
        }

        /// <summary>
        /// Deletes the index for a single commodity.
        /// </summary>
        /// <param name="ci"></param>
        public void DeleteIndex(Commodity ci)
        {
            IndexReader reader = null;
            try
            {
                if (ci == null) return;
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                reader = IndexReader.Open(directory, false);
                reader.DeleteDocuments(new Term("productid", ci.ProductId.ToString()));
            }
            catch (Exception ex)
            {

                logger.Error("DeleteIndex exception", ex);
                throw;
            }
            finally
            {
                if (reader != null)
                {
                    reader.Dispose();
                }
            }
        }

        /////// <summary>
        /////// Updates the index for a single commodity (old approach: delete, then insert)
        /////// </summary>
        //public void UpdateIndex(Commodity ci)
        //{
        //    DeleteIndex(ci);
        //    InsertIndex(ci);
        //}

        /// <summary>
        /// Updates the index for a single commodity.
        /// </summary>
        /// <param name="ci"></param>
        public void UpdateIndex(Commodity ci)
        {
            IndexWriter writer = null;
            try
            {
                if (ci == null) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;// if the directory contains no files, create a new index
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 100;// how often multiple segments are merged (default 10)
                writer.UseCompoundFile = true;// use compound files to reduce the number of index files
                writer.UpdateDocument(new Term("productid", ci.ProductId.ToString()), ParseCItoDoc(ci));
            }
            catch (Exception ex)
            {
                logger.Error("UpdateIndex exception", ex);
                throw;
            }
            finally
            {
                if (writer != null)
                {
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }

        /// <summary>
        /// Updates indexes for a batch of commodities.
        /// </summary>
        /// <param name="ciList">items sharing the same sourceflag</param>
        public void UpdateIndexMuti(List<Commodity> ciList)
        {
            IndexWriter writer = null;
            try
            {
                if (ciList == null || ciList.Count == 0) return;
                string rootIndexPath = StaticConstant.IndexPath;
                DirectoryInfo dirInfo = Directory.CreateDirectory(rootIndexPath);

                bool isCreate = dirInfo.GetFiles().Count() == 0;// if the directory contains no files, create a new index
                LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo);
                writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                writer.MergeFactor = 50;// how often multiple segments are merged (default 10)
                writer.UseCompoundFile = true;// use compound files to reduce the number of index files
                foreach (Commodity ci in ciList)
                {
                    writer.UpdateDocument(new Term("productid", ci.ProductId.ToString()), ParseCItoDoc(ci));
                }
            }
            catch (Exception ex)
            {
                logger.Error("UpdateIndexMuti exception", ex);
                throw;
            }
            finally
            {
                if (writer != null)
                {
                    //if (fileNum > 50)
                    //    writer.Optimize();
                    writer.Close();
                }
            }
        }
        #endregion Single/batch index add, update and delete

        #region PrivateMethod
        /// <summary>
        /// Creates the per-field analyzer wrapper.
        /// </summary>
        /// <returns></returns>
        private PerFieldAnalyzerWrapper CreateAnalyzerWrapper()
        {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

            PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer);
            analyzerWrapper.AddAnalyzer("title", new PanGuAnalyzer());
            analyzerWrapper.AddAnalyzer("categoryid", new StandardAnalyzer(Version.LUCENE_30));
            return analyzerWrapper;
        }

        /// <summary>
        /// Creates the index entry for a single commodity.
        /// </summary>
        /// <param name="writer"></param>
        /// <param name="ci"></param>
        private void CreateCIIndex(IndexWriter writer, Commodity ci)
        {
            try
            {
                writer.AddDocument(ParseCItoDoc(ci));
            }
            catch (Exception ex)
            {
                logger.Error("CreateCIIndex exception", ex);
                throw;
            }
        }

        /// <summary>
        /// Converts a Commodity into a Lucene Document.
        /// </summary>
        /// <param name="ci"></param>
        /// <returns></returns>
        private Document ParseCItoDoc(Commodity ci)
        {
            Document doc = new Document();

            doc.Add(new Field("id", ci.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("title", ci.Title, Field.Store.YES, Field.Index.ANALYZED));//盘古分词
            doc.Add(new Field("productid", ci.ProductId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("categoryid", ci.CategoryId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("imageurl", ci.ImageUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("url", ci.Url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new NumericField("price", Field.Store.YES, true).SetFloatValue((float)ci.Price));
            return doc;
        }

        #endregion PrivateMethod
    }
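
To make the "write to multiple directories, then merge" idea from the class comment concrete, here is a hypothetical driver sketch based on the methods above: each batch (for example, one per worker thread) is written to its own sub-directory under the index root, and the sub-indexes are then merged back into the root index. The sub-directory names "sa\1" and "sa\2" are only illustrative.

Code language: c#

using System.Collections.Generic;

// Hypothetical driver code (not part of the original project): build two partial
// indexes in sub-directories "sa\1" and "sa\2", then merge them into the root index.
public class BuildDemo
{
    public static void Run(List<Commodity> batch1, List<Commodity> batch2)
    {
        LuceneBulid builder = new LuceneBulid();

        // Each batch goes into its own directory; isCreate = true rebuilds that sub-index from scratch.
        builder.BuildIndex(batch1, "sa\\1", true);
        builder.BuildIndex(batch2, "sa\\2", true);

        // Merge the sub-indexes into the root directory (StaticConstant.IndexPath).
        builder.MergeIndex(new[] { "sa\\1", "sa\\2" });
    }
}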

Analyzer Encapsulation

Code language: c#
    public class LuceneAnalyze : ILuceneAnalyze
    {
        private Logger logger = new Logger(typeof(LuceneAnalyze));


        #region AnalyzerKey
        /// <summary>
        /// Tokenizes the search keyword.
        /// The resulting terms are intended to be joined with OR so that more data is matched (greedy query).
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public string[] AnalyzerKey(string keyword)
        {
            Analyzer analyzer = new PanGuAnalyzer();
            QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
            Query query = parser.Parse(this.CleanKeyword(keyword));
            if (query is TermQuery)
            {
                Term term = ((TermQuery)query).Term;
                return new string[] { term.Text };
            }
            else if (query is PhraseQuery)
            {
                Term[] term = ((PhraseQuery)query).GetTerms();
                return term.Select(t => t.Text).ToArray();
            }
            else if (query is BooleanQuery)// and  or
            {
                BooleanClause[] clauses = ((BooleanQuery)query).GetClauses();
                List<string> analyzerWords = new List<string>();
                foreach (BooleanClause clause in clauses)
                {
                    Query childQuery = clause.Query;
                    if (childQuery is TermQuery)
                    {
                        Term term = ((TermQuery)childQuery).Term;
                        analyzerWords.Add(term.Text);
                    }
                    else if (childQuery is PhraseQuery)
                    {
                        Term[] term = ((PhraseQuery)childQuery).GetTerms();
                        analyzerWords.AddRange(term.Select(t => t.Text));
                    }
                }
                return analyzerWords.ToArray();
            }
            else
            {
                logger.Debug(string.Format("AnalyzerKey parsed keyword={0}; returning new string[] {{ keyword }}", keyword));
                return new string[] { keyword };
            }
        }

        /// <summary>
        /// Cleans leading/trailing AND/OR operator keywords.
        /// </summary>
        /// <param name="keyword"></param>
        /// <returns></returns>
        private string CleanKeyword(string keyword)
        {
            if (!string.IsNullOrWhiteSpace(keyword))
            {
                bool isClean = false;
                while (!isClean)
                {
                    keyword = keyword.Trim();
                    if (keyword.EndsWith(" AND"))
                    {
                        keyword = string.Format("{0}and", keyword.Remove(keyword.Length - 3, 3));
                    }
                    else if (keyword.EndsWith(" OR"))
                    {
                        keyword = string.Format("{0}or", keyword.Remove(keyword.Length - 2, 2));
                    }
                    else if (keyword.StartsWith("AND "))
                    {
                        keyword = string.Format("and{0}", keyword.Substring(3));
                    }
                    else if (keyword.StartsWith("OR "))
                    {
                        keyword = string.Format("or{0}", keyword.Substring(2));
                    }
                    else if (keyword.Contains(" OR "))
                    {
                        keyword = keyword.Replace(" OR ", " or ");
                    }
                    else if (keyword.Contains(" AND "))
                    {
                        keyword = keyword.Replace(" AND ", " and ");
                    }
                    else
                        isClean = true;
                }

            }
            return QueryParser.Escape(keyword);
        }
        #endregion AnalyzerKey
    }
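
Finally, a small hypothetical example of how the analyzer wrapper feeds the query side: the terms returned by AnalyzerKey are joined with OR and handed to QueryIndex, which is the "greedy query" mentioned in the comment above. The sample keyword and the exact terms produced depend on the PanGu dictionary.

Code language: c#

using System;
using System.Collections.Generic;

// Hypothetical glue code (not part of the original classes): tokenize the user's
// keyword, then join the terms with OR for a greedy search against the title field.
public class SearchDemo
{
    public static void Main()
    {
        LuceneAnalyze analyzer = new LuceneAnalyze();
        LuceneQuery luceneQuery = new LuceneQuery();

        string[] words = analyzer.AnalyzerKey("高钙牛奶");   // e.g. { "高钙", "牛奶" }, depending on the dictionary
        string queryString = string.Join(" OR ", words);      // "高钙 OR 牛奶"

        List<Commodity> result = luceneQuery.QueryIndex(queryString);
        Console.WriteLine("matched {0} commodities", result.Count);
    }
}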