ES全文检索pdf、word、txt等文本文件内容-Toy模板网

这篇具有很好参考价值的文章主要介绍了ES全文检索pdf、word、txt等文本文件内容。希望对大家有所帮助。如果存在错误或未考虑完全的地方，请大家不吝赐教，您也可以点击"举报违法"按钮提交疑问。

需求：
用ES对上传文件内容的检索和高亮显示。
之前从事于物联网行业，从多年前了解ES以后没有使用过，本篇文章就是为了记录小白用ES完成工作的过程。
Elasticsearch的介绍、安装和环境这里不过多介绍，网上有很多。
思考：
文本关键字搜索，文本需要上传elasticsearch。支持任意格式文件。纯文本文件应该很容易实现，而对于包含图片和文本的文件怎么处理?
es的文本抽取插件可以帮我们实现。
环境介绍：
由于是已有的环境，es版本已经确定好了：elasticsearch 8.6.2。看了一下官方网页，属于很新的版本（这样的版本意味着遇到问题时不好找原因和解决办法）。
es 长文本检索,使用记录,全文检索,elasticsearch

es解析文本需要用到ingest attachment插件来解析文件中的文本，需要先把文件转成base64，具体可参考官网介绍：https://www.elastic.co/guide/en/elasticsearch/reference/8.7/attachment.html 。本次使用的es 8.6.2版本已经把该插件集成进来了，无需单独下载安装。低版本需要手动安装attachment插件：在安装目录下执行，
./bin/elasticsearch-plugin install ingest-attachment

创建索引库

PUT /file2
{
  "mappings": {
    "properties": {
      "deptId":{
        "type": "long"
      },
      "title":{
        "type": "text",
        "analyzer": "ik_smart"
      },
      "summary": {
        "type": "text",
        "analyzer": "ik_smart"
      },
      "attachment": {
        "properties": {
          "content":{
            "type": "text",
            "analyzer": "ik_smart",
            "index_options" : "offsets"
          }
        }
      }
    }
  }
}

创建名为attachment的ingest pipeline，用于抽取并解析文件中的文本内容

PUT _ingest/pipeline/attachment
{
  "description" : "Extract attachment information",
  "processors" : [
    {
      "attachment" : {
        "field" : "content",
        "remove_binary": false,
        "indexed_chars" : -1
      }
    }
  ]
}

"field": "content"：指定存放待解析文件内容（base64）的字段
"remove_binary": false：解析后保留原始的base64内容；设为true则不保留
"indexed_chars": -1：不限制提取的字符数；不设置时默认最多提取100000个字符
因为要使用高亮，选择RestHighLevelClient，所以需要引入依赖

		<!-- Elasticsearch Java High Level REST Client, used here for indexing and highlighted search. -->
		<!-- NOTE(review): this is a 7.17.x client while the server described above is 8.6.2; -->
		<!-- presumably the client must be run in API compatibility mode against an 8.x server - confirm. -->
		<dependency>
			<groupId>org.elasticsearch.client</groupId>
			<artifactId>elasticsearch-rest-high-level-client</artifactId>
			<version>7.17.4</version>
		</dependency>

创建RestHighLevelClient对象

// The 7.17 high-level client is being used against an 8.x server, so it is built through
// RestHighLevelClientBuilder with API compatibility mode switched on; this makes the client
// request 7.x-compatible responses from the newer server.
RestHighLevelClient restClient = new org.elasticsearch.client.RestHighLevelClientBuilder(
        RestClient.builder(new HttpHost(elasticsearchServerIp, elasticsearchServerPort, "http")).build())
        .setApiCompatibilityMode(true)
        .build();

上传文档内容

    /**
     * Indexes one uploaded file into Elasticsearch.
     *
     * <p>The title and summary are always stored on the entity. Depending on the type string
     * returned by {@code getFileTypeByDefaultTika}:
     * <ul>
     *   <li>not video, not image and not {@code application/zip}: the file bytes are
     *       Base64-encoded into {@code content}, the content type is set to 1 and the document
     *       is indexed through the {@code attachment} ingest pipeline with a 10 minute
     *       timeout;</li>
     *   <li>video, image or zip: the content type is set to 2 and the document is indexed
     *       without the pipeline;</li>
     *   <li>{@code null}: nothing is indexed.</li>
     * </ul>
     *
     * <p>The method is best-effort: any failure is logged and not propagated to the caller.
     *
     * @param fileUrl location of the file, passed to the type-detection and byte-loading helpers
     * @param title   title stored with the document
     * @param summary summary stored with the document
     */
    @Async
    public void addOrUpdateNew(String fileUrl, String title, String summary) {
        try {
            // NOTE(review): fileEntity is declared outside this method and is mutated here; if it
            // is shared between concurrent @Async invocations the calls can overwrite each other's
            // data, and content set by an earlier call may still be present - confirm.
            fileEntity.setTitle(title);
            fileEntity.setSummary(summary);

            String fileType = getFileTypeByDefaultTika(fileUrl);
            if (fileType == null) {
                return;
            }

            boolean extractText = !fileType.contains("video")
                    && !fileType.contains("image")
                    && !"application/zip".equals(fileType);

            IndexRequest indexRequest = new IndexRequest(endpoint);
            if (extractText) {
                byte[] bytes = toByteArray(fileUrl);
                fileEntity.setContent(Base64.getEncoder().encodeToString(bytes));
                fileEntity.setContentType(1);
                // Run the attachment pipeline at index time so the file text gets extracted.
                indexRequest.setPipeline("attachment");
                indexRequest.timeout(TimeValue.timeValueMinutes(10));
            } else {
                fileEntity.setContentType(2);
            }
            // Serialize only after every entity field for this branch has been set.
            indexRequest.source(JSON.toJSONString(fileEntity), XContentType.JSON);
            restClient.index(indexRequest, RequestOptions.DEFAULT);
        } catch (Exception e) {
            // Keep the best-effort contract, but make the failure visible instead of dropping it.
            java.util.logging.Logger.getLogger(getClass().getName())
                    .log(java.util.logging.Level.SEVERE, "Failed to index file: " + fileUrl, e);
        }
    }

分页、关键字、高亮查询

    /**
     * Runs a paged keyword search with highlighting, restricted to one department.
     *
     * <p>A document matches when its {@code deptId} equals the given id and the keyword matches,
     * as a phrase, at least one of {@code summary}, {@code title} or
     * {@code attachment.content}. The Base64 {@code content} source field is excluded from the
     * response. For every hit, title, summary and (for content type other than 2) the extracted
     * text are returned with the highlighted version when one exists, otherwise with the stored
     * value.
     *
     * @param deptId  department id used as a filter
     * @param keyword phrase to search for
     * @param current page number, 1-based
     * @param size    number of hits per page
     * @return the populated page object, or {@code null} when the search or the mapping of its
     *         results failed (the failure is logged)
     */
    public PageVo search(Long deptId, String keyword, Integer current, Integer size) {
        PageVo pageVo = new PageVo();
        pageVo.setSize(size);
        pageVo.setCurrent(current);
        try {
            // SearchRequest's constructor takes index names only; an HTTP verb passed here would
            // be treated as one more index to search.
            SearchRequest request = new SearchRequest(endpoint);

            // Query: filter by department, then require a phrase match on at least one field.
            // termQuery(String, Object) is used for the single id so the boxed Long resolves
            // to exactly one overload.
            BoolQueryBuilder boolQueryBuilder = new BoolQueryBuilder();
            boolQueryBuilder.filter(QueryBuilders.termQuery("deptId", deptId))
                    .should(QueryBuilders.matchPhraseQuery("summary", keyword))
                    .should(QueryBuilders.matchPhraseQuery("title", keyword))
                    .should(QueryBuilders.matchPhraseQuery("attachment.content", keyword))
                    .minimumShouldMatch(1);

            // Highlighting for the three searched fields.
            // NOTE(review): highlighted text is document content wrapped in raw HTML tags; if it
            // is rendered as HTML, consider escaping it (e.g. the highlighter's html encoder).
            HighlightBuilder hiBuilder = new HighlightBuilder();
            hiBuilder.field("title").field("summary").field("attachment.content");
            hiBuilder.preTags("<span style='color:red'>");
            hiBuilder.postTags("</span>");
            // fragment_size is the fragment length in characters.
            hiBuilder.fragmentSize(800000);
            // number_of_fragments = 0 returns the whole field as a single highlighted string.
            hiBuilder.numOfFragments(0);

            // The content field holds the Base64 payload and is excluded to keep responses small.
            String[] excludedFields = {"content"};
            SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
            searchSourceBuilder.from((current - 1) * size);
            searchSourceBuilder.size(size);
            searchSourceBuilder.query(boolQueryBuilder)
                    .highlighter(hiBuilder)
                    .fetchSource(null, excludedFields);
            request.source(searchSourceBuilder);

            // IndicesOptions.fromOptions arguments, in order:
            //   ignore_unavailable     - ignore indices that are unavailable
            //   allow_no_indices       - allow the request to resolve to no index
            //   expandToOpenIndices    - wildcards expand to open indices
            //   expandToClosedIndices  - wildcards expand to closed indices
            request.indicesOptions(IndicesOptions.fromOptions(true, true, true, false));

            SearchResponse search = restClient.search(request, RequestOptions.DEFAULT);
            SearchHits searchHits = search.getHits();
            TotalHits totalHits = searchHits.getTotalHits();
            pageVo.setTotal((int) totalHits.value);

            List<KnowledgeFile> ret = new ArrayList<>();
            for (SearchHit hit : searchHits.getHits()) {
                String sourceAsString = hit.getSourceAsString();
                KnowledgeFile parsedObject = JSONObject.parseObject(sourceAsString, KnowledgeFile.class);
                // Content type 2 marks documents indexed without text extraction.
                boolean hasExtractedText = parsedObject.getContentType() != 2;

                // The extracted text lives under attachment.content in the stored source.
                JSONObject attachment = JSONObject.parseObject(sourceAsString).getJSONObject("attachment");
                if (attachment != null && hasExtractedText) {
                    parsedObject.setContent(attachment.getString("content"));
                }

                Map<String, HighlightField> highlightFields = hit.getHighlightFields();
                KnowledgeFile knowledgeFile = new KnowledgeFile();
                knowledgeFile.setTitle(
                        firstFragmentOrDefault(highlightFields, "title", parsedObject.getTitle()));
                knowledgeFile.setSummary(
                        firstFragmentOrDefault(highlightFields, "summary", parsedObject.getSummary()));
                if (hasExtractedText) {
                    String content = firstFragmentOrDefault(
                            highlightFields, "attachment.content", parsedObject.getContent());
                    if (content != null) {
                        // Line breaks are converted so the text can be shown in HTML.
                        knowledgeFile.setContent(content.replace("\n", "<br/>"));
                    }
                }
                knowledgeFile.setContentType(parsedObject.getContentType());

                // The file field is itself a JSON string; skip it when it is absent so that one
                // incomplete document does not fail the whole page.
                Map<?, ?> fileMap = JSONObject.parseObject(parsedObject.getFile(), Map.class);
                if (fileMap != null) {
                    knowledgeFile.setFileName(String.valueOf(fileMap.get("fileName")));
                    knowledgeFile.setFileUrl(String.valueOf(fileMap.get("fileUrl")));
                    knowledgeFile.setFilePath(String.valueOf(fileMap.get("filePath")));
                }
                ret.add(knowledgeFile);
            }

            pageVo.setResult(ret);
            return pageVo;
        } catch (Exception e) {
            // Callers still receive null on failure, but the cause is no longer lost.
            java.util.logging.Logger.getLogger(getClass().getName())
                    .log(java.util.logging.Level.SEVERE, "Search failed for keyword: " + keyword, e);
            return null;
        }
    }

    /**
     * Returns the first highlighted fragment of a field, or the fallback when the field has no
     * highlight.
     *
     * @param highlightFields highlight results of one hit, keyed by field name
     * @param fieldName       field to look up
     * @param fallback        value returned when no fragment is available; may be {@code null}
     * @return the first fragment as a string, or {@code fallback}
     */
    private static String firstFragmentOrDefault(Map<String, HighlightField> highlightFields,
                                                 String fieldName, String fallback) {
        HighlightField field = highlightFields.get(fieldName);
        if (field == null || field.getFragments() == null || field.getFragments().length == 0) {
            return fallback;
        }
        return field.getFragments()[0].toString();
    }