Resolving NoSuchMethodError: java.lang.String com.google.common.net.HostAndPort.getHostText() when submitting a nebula-algorithm Spark job
Up front: the root cause was that I had downloaded the spark-without-hadoop build; use the spark-with-hadoop build instead!!! The without-hadoop build takes its Hadoop classes (and Hadoop's much newer bundled Guava) from the local Hadoop 3.2.3 installation via SPARK_DIST_CLASSPATH, while the Nebula Spark connector was compiled against the old Guava 14 API; the with-hadoop build ships its own older Hadoop plus guava-14.0.1, so the conflict never arises.
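A quick way to see the mismatch is to compare what each distribution puts on the classpath. This is only a sketch; the paths are placeholders based on the environment described below and should be adjusted to your own installation.

# The with-hadoop Spark build bundles its own Guava (expected: guava-14.0.1.jar):
ls /path/to/spark-2.4.x-bin-hadoopX.Y/jars | grep -i guava
# The without-hadoop build instead pulls in the local Hadoop's jars via SPARK_DIST_CLASSPATH,
# and Hadoop 3.2.x ships a much newer Guava in its common lib directory:
ls $HADOOP_HOME/share/hadoop/common/lib | grep -i guava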
----------------------------------------------
Reference steps: https://docs.nebula-graph.com.cn/3.1.0/nebula-algorithm/#_2
The error:
2022-05-31 17:40:16,459 WARN [main] util.Utils (Logging.scala:logWarning(66)) - Your hostname, bonelee-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2022-05-31 17:40:16,461 WARN [main] util.Utils (Logging.scala:logWarning(66)) - Set SPARK_LOCAL_IP if you need to bind to another address
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1357)
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1338)
    at org.apache.spark.deploy.SparkHadoopUtil$.org$apache$spark$deploy$SparkHadoopUtil$$appendS3AndSparkHadoopConfigurations(SparkHadoopUtil.scala:464)
    at org.apache.spark.deploy.SparkHadoopUtil$.newConfiguration(SparkHadoopUtil.scala:436)
    at org.apache.spark.deploy.SparkSubmit$$anonfun$2.apply(SparkSubmit.scala:323)
    at org.apache.spark.deploy.SparkSubmit$$anonfun$2.apply(SparkSubmit.scala:323)
    at scala.Option.getOrElse(Option.scala:121)
    at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:323)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:784)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:930)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:939)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Submit command:
spark-submit --master "local" \
  --conf spark.app.name="g1" \
  --conf spark.executor.extraLibraryPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  --conf spark.executor.extraClassPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  --driver-class-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  --driver-library-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  --class com.vesoft.nebula.algorithm.Main nebula-algorithm/target/nebula-algorithm-3.0.0.jar \
  -p /home/bonelee/Desktop/nebula-algorithm/application.conf
Versions:
Scala 2.11.12 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_333).
Spark: 2.4.8-bin-without-hadoop
Hadoop: 3.2.3
Nebula: 3.0.0
nebula-algorithm: 3.0.0
application.conf configuration:
{
# Spark relation config
spark: {
app: {
name: LPA
# spark.app.partitionNum
partitionNum:100
}
master:local
}
data: {
# data source. optional of nebula,csv,json
source: nebula
# data sink, means the algorithm result will be write into this sink. optional of nebula,csv,text
sink: csv
# if your algorithm needs weight
hasWeight: false
}
# Nebula Graph relation config
nebula: {
# algo's data source from Nebula. If data.source is nebula, then this nebula.read config can be valid.
read: {
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "127.0.0.1:9559"
# Nebula space
space: basketballplayer
# Nebula edge types, multiple labels means that data from multiple edges will union together
labels: ["serve"]
# Nebula edge property name for each edge type, this property will be as weight col for algorithm.
# Make sure the weightCols are corresponding to labels.
weightCols: ["start_year"]
}
# algo result sink into Nebula. If data.sink is nebula, then this nebula.write config can be valid.
write:{
# Nebula graphd server address, multiple addresses are split by English comma
graphAddress: "127.0.0.1:9669"
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "127.0.0.1:9559"
user:root
pswd:nebula
# Nebula space name
space:nb
# Nebula tag name, the algorithm result will be write into this tag
tag:pagerank
# algorithm result is insert into new tag or update to original tag. type: insert/update
type:insert
}
}
local: {
# algo's data source from Nebula. If data.source is csv or json, then this local.read can be valid.
read:{
filePath: "file:///tmp/algo_edge.csv"
srcId:"src"
# dstId column
dstId:"dst"
# weight column
weight: "weight"
# if csv file has header
header: true
# csv file's delimiter
delimiter:","
}
# algo result sink into local file. If data.sink is csv or text, then this local.write can be valid.
write:{
resultPath:/tmp/count
}
}
algorithm: {
# the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent,
# labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount,
# betweenness, graphtriangleCount, clusteringcoefficient, bfs, hanp, closeness, jaccard, node2vec]
executeAlgo: graphtrianglecount
# PageRank parameter
pagerank: {
maxIter: 10
resetProb: 0.15 # default 0.15
}
# Louvain parameter
louvain: {
maxIter: 20
internalIter: 10
tol: 0.5
}
# connected component parameter.
connectedcomponent: {
maxIter: 20
}
# LabelPropagation parameter
labelpropagation: {
maxIter: 20
}
# ShortestPaths parameter
shortestpaths: {
# several vertices to compute the shortest path to all vertices.
landmarks: "1"
}
# Vertex degree statistics parameter
degreestatic: {}
# KCore parameter
kcore:{
maxIter:10
degree:1
}
# Trianglecount parameter
trianglecount:{}
# graphTriangleCount parameter
graphtrianglecount:{}
# Betweenness centrality parameter. maxIter parameter means the max times of iterations.
betweenness:{
maxIter:5
}
# Clustering Coefficient parameter. The type parameter has two choice, local or global
# local type will compute the clustering coefficient for each vertex, and print the average coefficient for graph.
# global type just compute the graph's clustering coefficient.
clusteringcoefficient:{
type: local
}
# ClosenessAlgo parameter
closeness:{}
# BFS parameter
bfs:{
maxIter:5
root:"10"
}
# HanpAlgo parameter
hanp:{
hopAttenuation:0.1
maxIter:10
preference:1.0
}
#Node2vecAlgo parameter
node2vec:{
maxIter: 10,
lr: 0.025,
dataNumPartition: 10,
modelNumPartition: 10,
dim: 10,
window: 3,
walkLength: 5,
numWalks: 3,
p: 1.0,
q: 1.0,
directed: false,
degree: 30,
embSeparate: ",",
modelPath: "hdfs://127.0.0.1:9000/model"
}
# JaccardAlgo parameter
jaccard:{
tol: 1.0
}
}
}
If the data source above is changed from nebula to csv, the job runs without error!!! (That makes sense: the csv path never touches the Nebula Spark connector, which is where the failing Guava HostAndPort call lives.)
------------------------------------------------------------------------------------------------------------------------------
The error when submitting the Spark job with nebula-algorithm while reading from the nebula data source:
Exception in thread "main" java.lang.NoSuchMethodError: 'java.lang.String com.google.common.net.HostAndPort.getHostText()'
    at com.vesoft.nebula.connector.NebulaOptions$$anonfun$getMetaAddress$1.apply(NebulaOptions.scala:186)
    at com.vesoft.nebula.connector.NebulaOptions$$anonfun$getMetaAddress$1.apply(NebulaOptions.scala:183)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
    at com.vesoft.nebula.connector.NebulaOptions.getMetaAddress(NebulaOptions.scala:183)
    at com.vesoft.nebula.connector.reader.NebulaSourceReader.getSchema(NebulaSourceReader.scala:45)
    at com.vesoft.nebula.connector.reader.NebulaSourceReader.readSchema(NebulaSourceReader.scala:30)
    at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation$.create(DataSourceV2Relation.scala:175)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:223)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186)
    at com.vesoft.nebula.connector.connector.package$NebulaDataFrameReader.loadEdgesToDF(package.scala:172)
    at com.vesoft.nebula.algorithm.reader.NebulaReader$$anonfun$read$1.apply$mcVI$sp(DataReader.scala:52)
    at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
    at com.vesoft.nebula.algorithm.reader.NebulaReader.read(DataReader.scala:38)
    at com.vesoft.nebula.algorithm.Main$.createDataSource(Main.scala:118)
    at com.vesoft.nebula.algorithm.Main$.main(Main.scala:84)
    at com.vesoft.nebula.algorithm.Main.main(Main.scala)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:855)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:930)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:939)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
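The missing method here, com.google.common.net.HostAndPort.getHostText(), was deprecated and later removed from Guava (replaced by getHost()), and the Nebula Spark connector was compiled against the old Guava 14 API; with the without-hadoop Spark build, Hadoop 3.2.3's newer Guava wins on the classpath, so the call fails at runtime. Note the symmetry with the first error: forcing guava-14.0.jar onto the classpath instead breaks Hadoop's own Preconditions.checkArgument call, which is consistent with why juggling the jar did not help. A hedged way to confirm whether the Guava you are actually loading still has the method (the jar path below is an assumption; point it at whichever guava jar ends up on your classpath):

# Extract HostAndPort from the Guava jar on your classpath and list its methods.
# On newer Guava you should see getHost() but no getHostText().
cd /tmp
unzip -o $HADOOP_HOME/share/hadoop/common/lib/guava-*.jar 'com/google/common/net/HostAndPort.class' -d guava-check
javap -classpath guava-check com.google.common.net.HostAndPort | grep -i getHost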
Submit command:
/home/bonelee/spark-2.4.8-bin-without-hadoop/bin/spark-submit --master "local" \
  --class com.vesoft.nebula.algorithm.Main nebula-algorithm/target/nebula-algorithm-3.0.0.jar \
  -p /home/bonelee/Desktop/nebula-algorithm/application.conf
  # --driver-class-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  # --driver-library-path /home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  # --conf spark.executor.extraLibraryPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar \
  # --conf spark.executor.extraClassPath=/home/bonelee/Desktop/nebula-algorithm/guava-14.0.jar
Even with the approach described in https://www.cnblogs.com/1394htw/p/15151913.html, i.e. putting guava-14.0.jar on the classpath, the job still fails with this error!
Really frustrating... I spent a whole afternoon on it and got nowhere.
The workaround is:
Modify the data source in application.conf:
data: {
# data source. optional of nebula,csv,json
source: nebula
# data sink, means the algorithm result will be write into this sink. optional of nebula,csv,text
sink: csv
# if your algorithm needs weight
hasWeight: false
}
Change the nebula above to csv, and also configure the local section:
local: {
# algo's data source from Nebula. If data.source is csv or json, then this local.read can be valid.
read:{
filePath: "file:///tmp/algo_edge.csv"
srcId:"src"
# dstId column
dstId:"dst"
# weight column
weight: "weight"
# if csv file has header
header: true
# csv file's delimiter
delimiter:","
}
# algo result sink into local file. If data.sink is csv or text, then this local.write can be valid.
write:{
resultPath:/tmp/count
}
}
Contents of algo_edge.csv:
src,dst,weight
1,1,5.0
1,2,1.0
1,3,5.0
1,4,1.0
2,1,5.0
2,2,1.0
2,3,5.0
2,4,1.0
3,1,1.0
3,2,5.0
3,3,1.0
3,4,5.0
4,1,1.0
4,2,5.0
4,3,1.0
4,4,5.0
With this configuration, the job produces output correctly:
bonelee@bonelee-VirtualBox:~/Desktop/nebula-algorithm-2.6$ cat /tmp/count/part-00000-7ae75bc1-80e7-4469-9dda-268a8036db09-c000.csv
count
4
----------------------------------------------------
Update 2022-06-13: I finally got the nebula data source working, and of all things it was with Spark 2.4.7 on Windows. Unbelievable...
Key steps:
1. On Linux, modify the Nebula config files.
In etc/, change 127.0.0.1 in nebula-graphd.conf, nebula-storaged.conf, and nebula-metad.conf to the externally reachable address; mine is 8.35.24.181. The relevant excerpt looks like this (a sed sketch for making the edit follows the excerpt):
########## networking ##########
# Comma separated Meta Server Addresses
--meta_server_addrs=8.35.24.181:9559
# Local IP used to identify the nebula-graphd process.
# Change it to an address other than loopback if the service is distributed or
# will be accessed remotely.
--local_ip=8.35.24.181
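A sketch of making that edit in one go (the install prefix and service script path are assumptions; the blanket replace mirrors the change described above, so review the resulting diff before restarting):

# Assumed Nebula install prefix; creates .bak backups of each file.
cd /usr/local/nebula/etc
sudo sed -i.bak 's/127\.0\.0\.1/8.35.24.181/g' nebula-graphd.conf nebula-storaged.conf nebula-metad.conf
# Restart so the new addresses take effect (script path assumed):
sudo /usr/local/nebula/scripts/nebula.service restart all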
Then remember to run ADD HOSTS so the storage host registers under the new address, and check that it comes online.
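A hedged sketch of that ADD HOSTS step from nebula-console (the flags and addresses are assumptions taken from this setup):

# Register the storaged host under its externally reachable address.
nebula-console -addr 8.35.24.181 -port 9669 -u root -p nebula -e 'ADD HOSTS 8.35.24.181:9779;'

After that, SHOW HOSTS should report the new address as ONLINE, with the stale 127.0.0.1 entry left behind as OFFLINE: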
(root@nebula) [algo_edge]> show hosts
+---------------+------+-----------+-----------+--------------+----------------------+-----------------------------------+---------+
| Host          | Port | HTTP port | Status    | Leader count | Leader distribution  | Partition distribution            | Version |
+---------------+------+-----------+-----------+--------------+----------------------+-----------------------------------+---------+
| "127.0.0.1"   | 9779 | 19669     | "OFFLINE" | 0            | "No valid partition" | "basketballplayer:10, cadets:100" | "3.1.0" |
| "8.35.24.181" | 9779 | 19669     | "ONLINE"  | 10           | "algo_edge:10"       | "algo_edge:10"                    | "3.1.0" |
+---------------+------+-----------+-----------+--------------+----------------------+-----------------------------------+---------+
Got 2 rows (time spent 3122/3893 us)

Mon, 13 Jun 2022 18:41:25 CST
2. Insert data into the graph database:
drop space algo_edge;
create space algo_edge(partition_num=10,replica_factor=1,vid_type=INT64);
:sleep 20
use algo_edge;
create tag player(id int);
create edge serve(serve_year int);
:sleep 20
insert vertex player(id) values 1:(1);
insert vertex player(id) values 2:(2);
insert vertex player(id) values 3:(3);
insert vertex player(id) values 4:(4);
insert edge serve(serve_year) values 1->1:(5);
insert edge serve(serve_year) values 1->2:(1);
insert edge serve(serve_year) values 1->3:(5);
insert edge serve(serve_year) values 1->4:(1);
insert edge serve(serve_year) values 2->1:(5);
insert edge serve(serve_year) values 2->2:(1);
insert edge serve(serve_year) values 2->3:(5);
insert edge serve(serve_year) values 2->4:(1);
insert edge serve(serve_year) values 3->1:(1);
insert edge serve(serve_year) values 3->2:(5);
insert edge serve(serve_year) values 3->3:(1);
insert edge serve(serve_year) values 3->4:(5);
insert edge serve(serve_year) values 4->1:(1);
insert edge serve(serve_year) values 4->2:(5);
insert edge serve(serve_year) values 4->3:(1);
insert edge serve(serve_year) values 4->4:(5);
3. Modify the application conf file, especially the Nebula connection part:
{
# Spark relation config
spark: {
app: {
name: LPA
# spark.app.partitionNum
partitionNum:100
}
master:local
}
data: {
# data source. optional of nebula,csv,json
source: nebula
# data sink, means the algorithm result will be write into this sink. optional of nebula,csv,text
sink: csv
# if your algorithm needs weight
hasWeight: false
}
# Nebula Graph relation config
nebula: {
# algo's data source from Nebula. If data.source is nebula, then this nebula.read config can be valid.
read: {
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "8.35.24.181:9559"
# Nebula space
space: algo_edge
# Nebula edge types, multiple labels means that data from multiple edges will union together
labels: ["serve"]
# Nebula edge property name for each edge type, this property will be as weight col for algorithm.
# Make sure the weightCols are corresponding to labels.
weightCols: ["serve_year"]
}
# algo result sink into Nebula. If data.sink is nebula, then this nebula.write config can be valid.
write:{
# Nebula graphd server address, multiple addresses are split by English comma
graphAddress: "127.0.0.1:9669"
# Nebula metad server address, multiple addresses are split by English comma
metaAddress: "127.0.0.1:9559"
user:root
pswd:nebula
# Nebula space name
space:nb
# Nebula tag name, the algorithm result will be write into this tag
tag:pagerank
# algorithm result is insert into new tag or update to original tag. type: insert/update
type:insert
}
}
local: {
# algo's data source from Nebula. If data.source is csv or json, then this local.read can be valid.
read:{
filePath: "file:///D:/tmp/algo_edge.csv"
srcId:"src"
# dstId column
dstId:"dst"
# weight column
weight: "weight"
# if csv file has header
header: true
# csv file's delimiter
delimiter:","
}
# algo result sink into local file. If data.sink is csv or text, then this local.write can be valid.
write:{
resultPath: "file:///D:/tmp/result0613002"
}
}
algorithm: {
# the algorithm that you are going to execute,pick one from [pagerank, louvain, connectedcomponent,
# labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount,
# betweenness, graphtriangleCount, clusteringcoefficient, bfs, hanp, closeness, jaccard, node2vec]
executeAlgo: graphtrianglecount
# PageRank parameter
pagerank: {
maxIter: 10
resetProb: 0.15 # default 0.15
}
# Louvain parameter
louvain: {
maxIter: 20
internalIter: 10
tol: 0.5
}
# connected component parameter.
connectedcomponent: {
maxIter: 20
}
# LabelPropagation parameter
labelpropagation: {
maxIter: 20
}
# ShortestPaths parameter
shortestpaths: {
# several vertices to compute the shortest path to all vertices.
landmarks: "1"
}
# Vertex degree statistics parameter
degreestatic: {}
# KCore parameter
kcore:{
maxIter:10
degree:1
}
# Trianglecount parameter
trianglecount:{}
# graphTriangleCount parameter
graphtrianglecount:{}
# Betweenness centrality parameter. maxIter parameter means the max times of iterations.
betweenness:{
maxIter:5
}
# Clustering Coefficient parameter. The type parameter has two choice, local or global
# local type will compute the clustering coefficient for each vertex, and print the average coefficient for graph.
# global type just compute the graph's clustering coefficient.
clusteringcoefficient:{
type: local
}
# ClosenessAlgo parameter
closeness:{}
# BFS parameter
bfs:{
maxIter:5
root:"10"
}
# HanpAlgo parameter
hanp:{
hopAttenuation:0.1
maxIter:10
preference:1.0
}
#Node2vecAlgo parameter
node2vec:{
maxIter: 10,
lr: 0.025,
dataNumPartition: 10,
modelNumPartition: 10,
dim: 10,
window: 3,
walkLength: 5,
numWalks: 3,
p: 1.0,
q: 1.0,
directed: false,
degree: 30,
embSeparate: ",",
modelPath: "hdfs://127.0.0.1:9000/model"
}
# JaccardAlgo parameter
jaccard:{
tol: 1.0
}
}
}
4. Then run spark-submit; it now connects to the Nebula data source and runs successfully!!!
spark-submit --master "local" --class com.vesoft.nebula.algorithm.Main nebula-algorithm-3.0.0.jar -p c.conf
Note that this time I installed the Spark build that bundles Hadoop:
D:\app\spark-2.4.7-bin-hadoop2.7\bin\spark-submit
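As a sanity check that the with-hadoop build sidesteps the Guava clash, you can list its bundled jars; a sketch assuming a Git Bash shell on the same Windows box (path taken from above):

# The with-hadoop Spark 2.4.x build is expected to bundle guava-14.0.1.jar,
# which still contains HostAndPort.getHostText().
ls /d/app/spark-2.4.7-bin-hadoop2.7/jars | grep -i guava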
