elasticsearch

 

lucene : 倒排索引
如下: 我 (1:1) {0}  表示第一行出现一次,索引位置为0

  

 

 

 

 

elasticsearch 部署  elasticsearch-2.2.1.zip

192.168.112.101	node1
192.168.112.102	node2
192.168.112.103	node3

三台机器,每台机器上都部署。

es不能以root用户启动(因为es可以远程执行脚本,对于主机不安全)

## 所以三台主机都创建用户
[root@node2 ~]# useradd sxt
[root@node2 ~]# echo sxt | passwd --stdin sxt
[root@node2 ~]# mkdir -p /opt/sxt/es
[root@node2 ~]# cd /opt/sxt

[root@node1 sxt]# cd /opt/sxt/es/
[root@node1 es]# ll
total 28740
-rw-r--r--. 1 root root 29428075 Sep 10 21:18 elasticsearch-2.2.1.zip
[root@node1 sxt]# chown sxt:sxt es
[root@node1 sxt]# su sxt
[sxt@node1 sxt]$ cd es
[sxt@node1 es]$ ll
total 28740
-rw-r--r--. 1 root root 29428075 Sep 10 21:18 elasticsearch-2.2.1.zip
[sxt@node1 es]$ unzip elasticsearch-2.2.1.zip 
[sxt@node1 es]$ vi elasticsearch-2.2.1/config/elasticsearch.yml  ## 修改如下配置项
cluster.name: bjsxt-es
node.name: node1
network.host: 192.168.112.101

discovery.zen.ping.multicast.enabled: false   ## 放在末尾
discovery.zen.ping.unicast.hosts: ["192.168.112.101","192.168.112.102", "192.168.112.103"]
discovery.zen.ping_timeout: 120s
client.transport.ping_timeout: 60s

[sxt@node1 es]$ scp -r elasticsearch-2.2.1 sxt@node2:`pwd`  ## 分发到node2和node3(node3同理);分发后需在各节点把node.name和network.host改为本机对应的值
[sxt@node1 bin]$ cd /opt/sxt/es/elasticsearch-2.2.1/bin
[sxt@node1 bin]$ ./elasticsearch   ## node2,node3都启动此命令    

 

 

配置json内容的格式化ui

02_第二阶段  hadoop体系之离线计算\12_EL SEARCH 搜索引擎\01资料\01资料\附件\plugins 将该文件夹下的head目录上传到es的plugins目录下:
[root@node1 plugins]# pwd
/opt/sxt/es/elasticsearch-2.2.1/plugins
[root@node1 plugins]# ll
total 4
drwxr-xr-x. 6 sxt sxt 4096 Sep 10 21:41 head  ## 注意权限head 为sxt

[root@node1 plugins]# chown -R sxt:sxt head

  

 

## 如果不小心以root用户启动,报错,如下。此时需要删除logs文件夹。否则再次以sxt启动也可能失败。
[root@node1 plugins]# cd /opt/sxt/es/elasticsearch-2.2.1/bin
[root@node1 bin]# ./elasticsearch
Exception in thread "main" java.lang.RuntimeException: don't run elasticsearch as root.
	at org.elasticsearch.bootstrap.Bootstrap.initializeNatives(Bootstrap.java:93)
	at org.elasticsearch.bootstrap.Bootstrap.setup(Bootstrap.java:144)
	at org.elasticsearch.bootstrap.Bootstrap.init(Bootstrap.java:285)
	at org.elasticsearch.bootstrap.Elasticsearch.main(Elasticsearch.java:35)

[root@node1 elasticsearch-2.2.1]# rm -rf logs
## 重新启动   ### ctrl+c 结束程序
[root@node1 elasticsearch-2.2.1]# su sxt
[sxt@node1 elasticsearch-2.2.1]$ cd /opt/sxt/es/elasticsearch-2.2.1/bin
[sxt@node1 bin]$ ./elasticsearch

## 访问页面内容如下;
http://node2:9200/_plugin/head/

 

 

横向扩展sharding切片,纵向扩展搭建ha.
一般lucene的分片数量不可修改,需要在规划时考虑好;索引库一经创建,主分片数就不能再修改。(可以给分片做备份,即副本分片)

  

 

 

 

 

通过curl 操作es
[root@node1 plugins]# curl -XPUT http://192.168.112.101:9200/bjsxt/

如下:创建了lucene分片。粗体代表主分片,普通矩形框表示备分片

称为创建索引库 (相当于数据库)

  

 

 

 

node3挂掉后,出现短暂的警告,过一会儿又重新调整为如下第二图(达到健康状态了,自动备份了)。
再次重启node3.过一会如图第三。 * 代表是主。

 

  

 

 

 

 

 

 

curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
{
 "first_name" : "bin",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'
创建type和document.

[root@node1 plugins]# curl -XPUT http://192.168.112.101:9200/bjsxt/
{"acknowledged":true}[root@node1 plugins]# curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
> {
>  "first_name" : "bin",
>  "age" : 33,
>  "about" : "I love to go rock climbing",
>  "interests": [ "sports", "music" ]
> }'
{"_index":"bjsxt","_type":"employee","_id":"AW0brHsbOCeeN2j3g-hG","_version":1,"_shards":{"total":2,"successful":2,"failed":0},"created":true}[root@node1 plugins]# 

  

 

 

curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
{
 "first_name" : "gob bin",
 "age" : 43,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'


curl -XPOST http://192.168.112.101:9200/bjsxt/employee -d '
{
 "first_name" : "pablo2",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ],
 "sex": "man"
}'

#XPUT 必须给出id 
curl -XPUT http://192.168.112.101:9200/bjsxt/employee/1 -d '  
{
 "first_name" : "god bin",
 "last_name" : "pang",
 "age" : 42,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

## 修改age 44
curl -XPUT http://192.168.112.101:9200/bjsxt/employee/1 -d '
{
 "first_name" : "god bin",
 "last_name" : "pang",
 "age" : 44,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

curl -XPOST http://192.168.112.101:9200/bjsxt/employee/1 -d '
{
 "first_name" : "pablo2",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ],
 "sex": "man"
}'

## XPUT,XPOST 都可以做创建和修改。 XPUT 必须给出id,如果id不存在就创建,存在则修改。
## XPOST 可以不指定id(不指定时由es自动生成id)

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/1?pretty
{
  "_index" : "bjsxt",
  "_type" : "employee",
  "_id" : "1",
  "_version" : 4,
  "found" : true,
  "_source" : {
    "first_name" : "pablo2",
    "age" : 33,
    "about" : "I love to go rock climbing",
    "interests" : [ "sports", "music" ],
    "sex" : "man"
  }
}

  

 

 

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?q=first_name="bin"
{"took":31,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":2,"max_score":0.079459734,"hits":[{"_index":"bjsxt","_type":"employee","_id":"AW0brHsbOCeeN2j3g-hG","_score":0.079459734,"_source":
{
 "first_name" : "bin",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}},{"_index":"bjsxt","_type":"employee","_id":"AW0brvCeOCeeN2j3g-hH","_score":0.01125201,"_source":
{
 "first_name" : "gob bin",
 "age" : 43,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}}]}}

  

 

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
> {
>  "query":
>   {"match":
>    {"first_name":"bin"}
>   }
> }'
{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brHsbOCeeN2j3g-hG",
      "_score" : 1.0,
      "_source" : {
        "first_name" : "bin",
        "age" : 33,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    }, {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brvCeOCeeN2j3g-hH",
      "_score" : 0.19178301,
      "_source" : {
        "first_name" : "gob bin",
        "age" : 43,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    } ]
  }
}

  

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
> {
>  "query":
>   {"multi_match":
>    {
>     "query":"bin",
>     "fields":["last_name","first_name"],
>     "operator":"and"
>    }
>   }
> }'
{
  "took" : 13,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 0.5906161,
    "hits" : [ {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brHsbOCeeN2j3g-hG",
      "_score" : 0.5906161,
      "_source" : {
        "first_name" : "bin",
        "age" : 33,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    }, {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brvCeOCeeN2j3g-hH",
      "_score" : 0.058849156,
      "_source" : {
        "first_name" : "gob bin",
        "age" : 43,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    } ]
  }
}

  

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
> {
>  "query":
>   {"bool" :
>    {
>     "must" : 
>      {"match":
>       {"first_name":"bin"}
>      },
>     "must" : 
>      {"match":
>       {"age":33}
>      }
>    }
>   }
> }'
{
  "took" : 10,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 1.163388,
    "hits" : [ {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brHsbOCeeN2j3g-hG",
      "_score" : 1.163388,
      "_source" : {
        "first_name" : "bin",
        "age" : 33,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    } ]
  }
}

  

 

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
> {
>  "query":
>   {"bool" :
>    {
>     "must" : 
>      {"match":
>       {"first_name":"bin"}
>      },
>     "must_not" : 
>      {"match":
>       {"age":33}
>      }
>    }
>   }
> }'
{
  "took" : 8,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 0.19178301,
    "hits" : [ {
      "_index" : "bjsxt",
      "_type" : "employee",
      "_id" : "AW0brvCeOCeeN2j3g-hH",
      "_score" : 0.19178301,
      "_source" : {
        "first_name" : "gob bin",
        "age" : 43,
        "about" : "I love to go rock climbing",
        "interests" : [ "sports", "music" ]
      }
    } ]
  }
}

  

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search?pretty -d '
> {
>  "query":
>   {"bool" :
>    {
>     "must_not" : 
>      {"match":
>       {"first_name":"bin"}
>      },
>     "must_not" : 
>      {"match":
>       {"age":33}
>      }
>    }
>   }
> }'
{
  "took" : 10,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 0,
    "max_score" : null,
    "hits" : [ ]
  }
}

  

以集合的方式思考

 

[root@node1 plugins]# curl -XGET http://192.168.112.101:9200/bjsxt/employee/_search -d '
> {
>  "query":
>   {"bool" :
>    {
>    "must" :
>     {"term" : 
>      { "first_name" : "bin" }
>     }
>    ,
>    "must_not" : 
>     {"range":
>      {"age" : { "from" : 20, "to" : 33 }
>     }
>    }
>    }
>   }
> }'
{"took":17,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":1,"max_score":0.19178301,"hits":[{"_index":"bjsxt","_type":"employee","_id":"AW0brvCeOCeeN2j3g-hH","_score":0.19178301,"_source":
{
 "first_name" : "gob bin",
 "age" : 43,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}}]}}

  

 

curl -XPUT 'http://192.168.112.101:9200/test2/' -d'{"settings":{"number_of_replicas":2}}'

  

 

 

curl -XPUT 'http://192.168.112.101:9200/test3/' -d'{"settings":{"number_of_shards":3,"number_of_replicas":3}}'

  

 

 

file
segment(段,多个document组成)
document(一条记录,一个对象实例)
field(对象的属性)
term(项,分词之后的词条)



# yes
curl -XPUT http://192.168.133.6:9200/bjsxt/
# yes 
curl -XDELETE http://192.168.133.6:9200/test2/
curl -XDELETE http://192.168.133.6:9200/test3/

#document:yes 
curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
{
 "first_name" : "bin",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
{
 "first_name" : "gob bin",
 "age" : 43,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

curl -XPOST http://192.168.133.6:9200/bjsxt/employee/2 -d '
{
 "first_name" : "bin",
 "age" : 45,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'


#add field yes

curl -XPOST http://192.168.133.6:9200/bjsxt/employee -d '
{
 "first_name" : "pablo2",
 "age" : 33,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ],
 "sex": "man"
}'

curl -XPOST http://192.168.133.6:9200/bjsxt/employee/1 -d '
{
 "first_name" : "pablo2",
 "age" : 35,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ],
 "sex": "man"
}'


----------------------------------------


#put:yes


curl -XPUT http://192.168.133.6:9200/bjsxt/employee/1 -d '
{
 "first_name" : "god bin",
 "last_name" : "pang",
 "age" : 42,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

curl -XPUT http://192.168.133.6:9200/bjsxt/employee -d '
{
 "first_name" : "god bin",
 "last_name" : "bin",
 "age" : 45,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'


curl -XPUT http://192.168.133.6:9200/bjsxt/employee/2 -d '
{
 "first_name" : "god bin",
 "last_name" : "bin",
 "age" : 45,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'

curl -XPUT http://192.168.133.6:9200/bjsxt/employee/1 -d '
{
 "first_name" : "god bin",
 "last_name" : "pang",
 "age" : 40,
 "about" : "I love to go rock climbing",
 "interests": [ "sports", "music" ]
}'



#根据document的id来获取数据:(?pretty 表示格式化输出,去掉则输出为一行)
curl -XGET http://192.168.133.6:9200/bjsxt/employee/1?pretty

#根据field来查询数据:
curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?q=first_name="bin"

#根据field来查询数据:match
curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
{
 "query":
  {"match":
   {"first_name":"bin"}
  }
}'



#对多个field发起查询:multi_match
curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
{
 "query":
  {"multi_match":
   {
    "query":"bin",
    "fields":["last_name","first_name"],
    "operator":"and"
   }
  }
}'


#多个term对多个field发起查询:bool(boolean) 
# 组合查询,must,must_not,should 
#  must + must : 交集
#  must +must_not :差集
#  should+should  : 并集

curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
{
 "query":
  {"bool" :
   {
    "must" : 
     {"match":
      {"first_name":"bin"}
     },
    "must" : 
     {"match":
      {"age":33}
     }
   }
  }
}'

curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
{
 "query":
  {"bool" :
   {
    "must" : 
     {"match":
      {"first_name":"bin"}
     },
    "must_not" : 
     {"match":
      {"age":33}
     }
   }
  }
}'





curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search?pretty -d '
{
 "query":
  {"bool" :
   {
    "must_not" : 
     {"match":
      {"first_name":"bin"}
     },
    "must_not" : 
     {"match":
      {"age":33}
     }
   }
  }
}'

##查询first_name=bin,并且年龄不在20岁到33岁之间的(must + must_not : 差集)

curl -XGET http://192.168.133.6:9200/bjsxt/employee/_search -d '
{
 "query":
  {"bool" :
   {
   "must" :
    {"term" : 
     { "first_name" : "bin" }
    }
   ,
   "must_not" : 
    {"range":
     {"age" : { "from" : 20, "to" : 33 }
    }
   }
   }
  }
}'


#创建索引库时指定配置(分片数 number_of_shards、副本数 number_of_replicas)
curl -XPUT 'http://192.168.133.6:9200/test2/' -d'{"settings":{"number_of_replicas":2}}'

curl -XPUT 'http://192.168.133.6:9200/test3/' -d'{"settings":{"number_of_shards":3,"number_of_replicas":3}}'

curl -XPUT 'http://192.168.133.6:9200/test4/' -d'{"settings":{"number_of_shards":6,"number_of_replicas":4}}'


curl -XPOST http://192.168.9.11:9200/bjsxt/person/_mapping -d'
{
    "person": {
        "properties": {
            "content": {
                "type": "string",
                "store": "no",
                "term_vector": "with_positions_offsets",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_max_word",
                "include_in_all": "true",
                "boost": 8
            }
        }
    }
}'

  

 

官网
https://www.elastic.co/guide/index.html
https://www.elastic.co/guide/en/elasticsearch/client/index.html
https://www.elastic.co/guide/en/elasticsearch/client/java-api/index.html
https://www.elastic.co/guide/en/elasticsearch/client/java-api/2.2/transport-client.html

 

 

 

爬取数据,作为document的原始文件。在linux上
yum install wget  
## 如下命令爬取 http://news.cctv.com;并且按照原有网站的url目录存储到data下
wget -o /tmp/wget.log -P /root/data  --no-parent --no-verbose -m -D news.cctv.com   -N --convert-links --random-wait -A html,HTML,shtml,SHTML http://news.cctv.com
配置分词器
https://github.com/medcl/elasticsearch-analysis-ik 
版本必须与es相对应

elasticsearch-2.2.1.zip 
elasticsearch-analysis-ik-1.8.0.zip  ## 
[sxt@node1 ik]$ pwd
/opt/sxt/es/elasticsearch-2.2.1/plugins/ik  ## 修改如下配置文件
[sxt@node1 ik]$ cat plugin-descriptor.properties | grep version=
elasticsearch.version=2.2.1 ## 版本号也修改对应。

## 启动es.

## 运行java程序  createIndex

package com.sxt.es;

import java.io.File;
import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryParser;
import org.elasticsearch.index.query.RangeQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.junit.Test;
import org.springframework.stereotype.Service;

import com.sxt.util.HtmlTool;

/**
 * Index maintenance and full-text search over the "bjsxt" Elasticsearch index,
 * whose "htmlbean" type stores the title, content and url of crawled html pages.
 *
 * A single shared TransportClient is created when the class is loaded; it joins
 * cluster "bjsxt-es" through node1, node2 and node3 on transport port 9300.
 */
@Service
public class IndexService {

	// Directory that holds the crawled html files
//	public static String DATA_DIR="C:\\data\\";
	public static String DATA_DIR="d:\\data\\";
	
	// Shared client; stays null if the static initializer below fails
	public static Client client;

	// Name of the index and of the document type used by every method here
	private static final String INDEX_NAME = "bjsxt";
	private static final String TYPE_NAME = "htmlbean";
	// Number of hits requested per result page
	private static final int PAGE_SIZE = 10;

	static {
		Settings settings = Settings.settingsBuilder()
				.put("cluster.name", "bjsxt-es").build();
		try {
			client = TransportClient
					.builder()
					.settings(settings)
					.build()
					.addTransportAddress(
							new InetSocketTransportAddress(InetAddress
									.getByName("node1"), 9300))
					.addTransportAddress(
							new InetSocketTransportAddress(InetAddress
									.getByName("node2"), 9300))
					.addTransportAddress(
							new InetSocketTransportAddress(InetAddress
									.getByName("node3"), 9300));
		} catch (Exception e) {
			// Best effort: the failure is only printed, so later calls will hit a null client
			e.printStackTrace();
		}
	}

	/**
	 * (Re)creates the "bjsxt" index and registers the "htmlbean" mapping.
	 *
	 * An existing index of the same name is deleted first, so all previously
	 * indexed documents are lost. The mapping stores "title" and "content" and
	 * analyzes both with the ik_max_word analyzer.
	 *
	 * client.admin().indices() manages indices; client.prepareXxx() manages the
	 * documents inside an index.
	 */
	@Test
	public void createIndex() throws Exception {
		IndicesExistsResponse resp = client.admin().indices().prepareExists(INDEX_NAME).execute().actionGet();
		if(resp.isExists()){
			client.admin().indices().prepareDelete(INDEX_NAME).execute().actionGet();
		}
		client.admin().indices().prepareCreate(INDEX_NAME).execute().actionGet();

		XContentBuilder builder = XContentFactory.jsonBuilder().startObject()
				.startObject(TYPE_NAME).startObject("properties")
				.startObject("title").field("type", "string")
				.field("store", "yes").field("analyzer", "ik_max_word")
				.field("search_analyzer", "ik_max_word").endObject()
				.startObject("content").field("type", "string")
				.field("store", "yes").field("analyzer", "ik_max_word")
				.field("search_analyzer", "ik_max_word").endObject()
				// "url" is intentionally left out of the explicit mapping (kept for reference):
//				.startObject("url").field("type", "string")
//				.field("store", "yes").field("analyzer", "ik_max_word")
//				.field("search_analyzer", "ik_max_word").endObject()
				.endObject().endObject().endObject();
		PutMappingRequest mapping = Requests.putMappingRequest(INDEX_NAME).type(TYPE_NAME).source(builder);
		client.admin().indices().putMapping(mapping).actionGet();

	}
	
	/**
	 * Adds every html file found under DATA_DIR to the index (builds the index data).
	 */
	@Test
	public void addHtmlToES(){
		readHtml(new File(DATA_DIR));
	}
	
	/**
	 * Recursively walks a file or directory and indexes each file that
	 * HtmlTool can parse into an HtmlBean.
	 *
	 * A failure on one file is printed and skipped so the rest of the tree is
	 * still processed.
	 *
	 * @param file file or directory to process
	 */
	public void readHtml(File file){
		if(file.isDirectory()){
			File[] children = file.listFiles();
			if(children == null){
				// listFiles() returns null on an I/O error or when the directory is unreadable
				return;
			}
			for (File child : children) {
				readHtml(child);
			}
			return;
		}
		try {
			HtmlBean bean = HtmlTool.parserHtml(file.getPath());
			if(bean!=null){
				Map<String, String> dataMap =new HashMap<String, String>();
				dataMap.put("title", bean.getTitle());
				dataMap.put("content", bean.getContent());
				dataMap.put("url", bean.getUrl());
				// Write the document; no id is given, so Elasticsearch generates one
				client.prepareIndex(INDEX_NAME, TYPE_NAME).setSource(dataMap).execute().actionGet();
			}
		} catch (Throwable e) {
			// NOTE(review): Throwable is kept as in the original; narrow it once
			// the exceptions declared by HtmlTool.parserHtml are confirmed.
			e.printStackTrace();
		}
	}
	
	/**
	 * Searches "title" and "content" for a keyword and returns one page of hits
	 * with the matched terms wrapped in a red font tag.
	 *
	 * @param kw    keyword to search for
	 * @param num   page index; page 1 always starts at offset 0
	 * @param count total hit count from an earlier query; it is set on the page
	 *              bean before the start row is read for pages other than 1
	 *              (presumably PageBean.getStartRow() depends on it - confirm)
	 * @return page bean holding the total hit count and the hits of this page
	 */
	public PageBean<HtmlBean> search(String kw,int num,int count){
		PageBean<HtmlBean> wr =new PageBean<HtmlBean>();
		wr.setIndex(num);
//		// Alternative: build the query from two match queries
//		MatchQueryBuilder q1 =new MatchQueryBuilder("title", kw);
//		MatchQueryBuilder q2 =new MatchQueryBuilder("content", kw);
//		
//		// combined with a bool query (should + should)
//		BoolQueryBuilder q =new BoolQueryBuilder();
//		q.should(q1);
//		q.should(q2);
		
//		RangeQueryBuilder q1 =new RangeQueryBuilder("age");
//		q1.from(18);
//		q1.to(40);
		
		MultiMatchQueryBuilder q =new MultiMatchQueryBuilder(kw, new String[]{"title","content"});

		// Only the offset differs between the first page and later pages
		int from = 0;
		if(wr.getIndex()!=1){
			wr.setTotalCount(count);
			from = wr.getStartRow();
		}
		SearchResponse resp = client.prepareSearch(INDEX_NAME)
				.setTypes(TYPE_NAME)
				.setQuery(q)
				.addHighlightedField("title")
				.addHighlightedField("content")
				.setHighlighterPreTags("<font color=\"red\">")
				.setHighlighterPostTags("</font>")
				.setHighlighterFragmentSize(40)// length of one highlighted fragment
				.setHighlighterNumOfFragments(5)// max fragments per hit; they are joined with "..." below
				.setFrom(from)
				.setSize(PAGE_SIZE)
				.execute().actionGet();

		SearchHits hits= resp.getHits();
		wr.setTotalCount((int)hits.getTotalHits());
		
		for(SearchHit hit : hits.getHits()){
			HtmlBean bean =new HtmlBean();

			Text[] titleFragments = highlightFragments(hit, "title");
			if(titleFragments==null || titleFragments.length==0){
				// Keyword not found in the title: fall back to the plain stored title
				bean.setTitle(sourceValue(hit, "title"));
			}else{
				bean.setTitle(titleFragments[0].toString());
			}

			Text[] contentFragments = highlightFragments(hit, "content");
			if(contentFragments==null){
				// Keyword not found in the content: fall back to the plain stored content
				bean.setContent(sourceValue(hit, "content"));
			}else{
				StringBuilder sb =new StringBuilder();
				for(Text text: contentFragments){
					sb.append(text.toString()).append("...");
				}
				bean.setContent(sb.toString());
			}
			
			bean.setUrl("http://"+sourceValue(hit, "url"));
			wr.setBean(bean);
		}
		
		return wr;
	}

	/**
	 * Returns the highlighted fragments of a field, or null when the hit has no
	 * highlight entry for that field.
	 */
	private static Text[] highlightFragments(SearchHit hit, String field){
		if(hit.getHighlightFields().get(field)==null){
			return null;
		}
		return hit.getHighlightFields().get(field).getFragments();
	}

	/**
	 * Returns a _source field as text, or an empty string when the field is
	 * missing or null, so one incomplete document cannot fail the whole search.
	 */
	private static String sourceValue(SearchHit hit, String field){
		Object value = hit.getSource().get(field);
		return value==null ? "" : value.toString();
	}
	
	
//	@Test
//	public void del(){
////		client.admin().indices().prepareDelete("bjsxt").execute().actionGet();
//		client.admin().indices().prepareDelete("bjsxt2").execute().actionGet();
//	}
}

## 将linux上wget爬取到的数据(/root/data下的内容)拷贝到windows的 D:\data\ 下(与代码中的DATA_DIR一致)。
## 运行addHtmlToES()方法,数据文档添加到es中

## 如下是对项目:ES_SEARCH的演示效果。

  

 

 

Windows 下查看端口占用及对应的pid,并杀死该pid对应的进程
C:\WINDOWS\system32>netstat -ano | findstr 8080
  TCP    0.0.0.0:8080           0.0.0.0:0              LISTENING       9448
  TCP    [::]:8080              [::]:0                 LISTENING       9448

C:\WINDOWS\system32>taskkill /PID 9448 /F

  

posted @ 2019-09-10 22:07  星回中道  阅读(368)  评论(0编辑  收藏  举报