Winse Blog

Stopping and going, it is all scenery; amid the bustle, everything tends toward the best; all the busyness is for tomorrow, so what is there to fear?

RPM Packaging

References

Practice

  • System setup
# start a new docker container for testing and learning
[root@cu1 ~]# docker run -ti centos:centos6 /bin/bash

[root@bdc25400cc63 mywget]# cat /etc/redhat-release 
CentOS release 6.6 (Final)

# install the packages required for the build environment
yum install which tree lrzsz tar gcc rpm-build
# build dependencies for wget
yum install -y gnutls gnutls-devel
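
One more bit of preparation: the steps below assume the wget 1.17 source tarball has already been placed under /home. If it has not, it can be fetched first; the URL below follows the standard GNU mirror layout and is an assumption of this sketch:

# fetch the wget source tarball used in the following steps (adjust the mirror if needed)
cd /home
curl -LO https://ftp.gnu.org/gnu/wget/wget-1.17.tar.gz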
  • Steps
[root@bdc25400cc63 home]# mkdir mywget 
[root@bdc25400cc63 home]# cd mywget/
[root@bdc25400cc63 mywget]# mkdir BUILD RPMS SOURCES SPECS SRPMS
[root@bdc25400cc63 mywget]# cd SOURCES/
[root@bdc25400cc63 SOURCES]# mv /home/wget-1.17.tar.gz .
[root@bdc25400cc63 SOURCES]# ls
wget-1.17.tar.gz
[root@bdc25400cc63 SOURCES]# cd ..

[root@bdc25400cc63 mywget]# rpmbuild --showrc
[test@bdc25400cc63 mywget]$ rpm --eval "%{_topdir}"

[test@bdc25400cc63 mywget]$ grep -i _topdir /usr/lib/rpm/rpmrc /usr/lib/rpm/redhat/rpmrc /usr/lib/rpm/macros /usr/lib/rpm/redhat/macros  | less
/usr/lib/rpm/macros:%_builddir          %{_topdir}/BUILD
/usr/lib/rpm/macros:%_rpmdir            %{_topdir}/RPMS
/usr/lib/rpm/macros:%_sourcedir         %{_topdir}/SOURCES
/usr/lib/rpm/macros:%_specdir           %{_topdir}/SPECS
/usr/lib/rpm/macros:%_srcrpmdir         %{_topdir}/SRPMS
/usr/lib/rpm/macros:%_buildrootdir              %{_topdir}/BUILDROOT
/usr/lib/rpm/macros:%_topdir            %{getenv:HOME}/rpmbuild

[test@bdc25400cc63 mywget]$ cat ~/.rpmmacros 
%_topdir /home/mywget/rpm

# 2016-5-12 15:28:35
# the spec mixes %define and %global; that is probably what caused this! using %global alone should be enough?

[root@bdc25400cc63 mywget]# vi SPECS/wget.spec
  # this is a sample spec file for wget
  
  %define _topdir /home/mywget
  %define name    wget
  %define release 2
  %define version 1.17
  # Defining _buildrootdir has no effect, not sure why??? With %_topdir defined in .rpmmacros, the buildroot went to /home/mywget/rpm/BUILDROOT instead. (see the note after this listing)
  
  %define _unpackaged_files_terminate_build 0
  
  Summary:   GNU wget
  License:   GPL
  Name:      %{name}
  Version:   %{version}
  Release:   %{release}
  Source:    %{name}-%{version}.tar.gz
  Prefix:    /usr/local/wget
  Group:     Development/Tools
  
  %description
  The GNU wget program downloads files from the Internet using the command-line.
  
  %prep
  %setup -q
  
  %build
  ./configure
  make
  
  %install
  make install prefix=$RPM_BUILD_ROOT/usr/local/wget # or use DESTDIR=$RPM_BUILD_ROOT
  
  %post
  echo "hello world"
  
  %preun
  echo "bye"
  
  %clean
  rm -rf $RPM_BUILD_ROOT
  
  %files
  %defattr(-, root, root)
  /usr/local/wget/bin/wget
  
[root@bdc25400cc63 mywget]# rpmbuild -vv -bb --clean SPECS/wget.spec 

[root@bdc25400cc63 mywget]# tree .
.
├── BUILD
├── RPMS
│   └── x86_64
│       ├── wget-1.17-2.x86_64.rpm
│       └── wget-debuginfo-1.17-2.x86_64.rpm
├── SOURCES
│   └── wget-1.17.tar.gz
├── SPECS
│   └── wget.spec
└── SRPMS

6 directories, 4 files

[root@bdc25400cc63 mywget]# rpm -qpl RPMS/x86_64/wget-1.17-2.x86_64.rpm  
/usr/local/wget/bin/wget
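
A note on %_topdir, for the questions in the comments above: the macro can come from the spec's %define, from ~/.rpmmacros, or from the rpmbuild command line, and mixing those places is exactly what makes BUILDROOT end up somewhere unexpected. A sketch of keeping it in a single place, using the paths from this example:

# set _topdir for this invocation only, instead of %define in the spec or ~/.rpmmacros
rpmbuild -vv -bb --clean --define "_topdir /home/mywget" SPECS/wget.spec

# check what the directory macros actually expand to
rpm --eval "%{_topdir}"
rpm --eval "%{_buildrootdir}"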

Now the package can be copied to other machines and installed directly. If you maintain a local repository, run createrepo to refresh it and the freshly built package can then be installed with yum.
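
A minimal sketch of that local-repository flow; the repo directory, server URL, and repo id below are made up for illustration:

# copy the freshly built rpm into the local repository and refresh its metadata
# (yum install -y createrepo if the tool is not present)
cp RPMS/x86_64/wget-1.17-2.x86_64.rpm /var/www/html/localrepo/
createrepo --update /var/www/html/localrepo/

# on a client, point a repo file at it and install with yum as usual
cat > /etc/yum.repos.d/local.repo <<'EOF'
[localrepo]
name=local repo
baseurl=http://repo-server/localrepo/
enabled=1
gpgcheck=0
EOF
yum clean metadata
yum install -y wget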

Note: %pre, %post, %preun, and %postun can run scripts before and after installation and removal.
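
For reference, each scriptlet also receives a count in $1 that tells install, upgrade, and erase apart; a hedged sketch of how that is usually checked (not part of the wget.spec above):

%post
# $1 == 1: first install, $1 >= 2: upgrade
if [ "$1" -eq 1 ]; then
  echo "hello world (fresh install)"
fi

%preun
# $1 == 0: package is being erased, $1 == 1: upgrade in progress
if [ "$1" -eq 0 ]; then
  echo "bye (real uninstall)"
fi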

[root@cu2 ganglia-build]# mkdir BUILD RPMS SOURCES SPECS SRPMS
[root@cu2 ganglia-build]# cd SOURCES/
[root@cu2 SOURCES]# ll
total 1272
-rw-r--r-- 1 root root 1302320 Jan 20 09:35 ganglia-3.7.2.tar.gz
[root@cu2 SOURCES]# cd ..

[root@cu2 ganglia-build]# ll
total 20
drwxr-xr-x 2 root root 4096 Jun 15 10:25 BUILD
drwxr-xr-x 2 root root 4096 Jun 15 10:25 RPMS
drwxr-xr-x 2 root root 4096 Jun 15 10:25 SOURCES
drwxr-xr-x 2 root root 4096 Jun 15 10:25 SPECS
drwxr-xr-x 2 root root 4096 Jun 15 10:25 SRPMS

[root@cu2 ganglia-build]# cd SPECS/
[root@cu2 SPECS]# vi gmetad.spec

[root@cu2 ganglia-build]# rpmbuild --clean -v -ba SPECS/gmetad.spec 

[root@cu2 ganglia-build]# rpm -qpl RPMS/x86_64/ganglia-3.7.2-1.el6.x86_64.rpm 

Repackaging an existing RPM

Download the source rpm, modify its contents, and finally repackage it with rpm-build.

Here puppetserver is the example: jdk7 is sufficient, but the official package declares a dependency on jdk8, so we change the dependency and rebuild:

[root@cu2 rpmbuild]# rpm -ivh puppetserver-2.3.1-1.el6.src.rpm 
warning: puppetserver-2.3.1-1.el6.src.rpm: Header V4 RSA/SHA1 Signature, key ID 4bd6ec30: NOKEY
   1:puppetserver           warning: user mockbuild does not exist - using root
warning: group mockbuild does not exist - using root
########################################### [100%]
warning: user mockbuild does not exist - using root
warning: group mockbuild does not exist - using root
[root@cu2 rpmbuild]# ll
total 32904
-rw-r--r-- 1 root root 33681889 May 10 17:44 puppetserver-2.3.1-1.el6.src.rpm
drwxr-xr-x 2 root root     4096 May 10 17:55 SOURCES
drwxr-xr-x 2 root root     4096 May 10 17:55 SPECS

#-- comment out the jdk8 part
[root@cu2 rpmbuild]# grep -3 jdk SPECS/puppetserver.spec 

# java 1.8.0 is available starting in fedora 20 and el 6
#%if 0%{?fedora} >= 20 || 0%{?rhel} >= 6
#%global open_jdk          java-1.8.0-openjdk-headless
#%else
%global open_jdk          java-1.7.0-openjdk
#%endif

[root@cu2 rpmbuild]# yum install -y ruby
[root@cu2 rpmbuild]# rpmbuild -v -bb --clean SPECS/puppetserver.spec 

[root@cu2 rpmbuild]# yum deplist RPMS/noarch/puppetserver-2.3.1-1.el6.noarch.rpm 
Loaded plugins: fastestmirror, priorities
Finding dependencies: 
Loading mirror speeds from cached hostfile
 * base: centos.ustc.edu.cn
 * centosplus: centos.ustc.edu.cn
 * epel: mirror01.idc.hinet.net
 * extras: centos.ustc.edu.cn
 * updates: centos.ustc.edu.cn
193 packages excluded due to repository priority protections
package: puppetserver.noarch 2.3.1-1.el6
  dependency: chkconfig
   provider: chkconfig.x86_64 1.3.49.3-5.el6
   provider: chkconfig.x86_64 1.3.49.3-5.el6_7.2
  dependency: /bin/bash
   provider: bash.x86_64 4.1.2-33.el6
   provider: bash.x86_64 4.1.2-33.el6_7.1
  dependency: java-1.7.0-openjdk
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.79-2.5.5.4.el6
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.101-2.6.6.1.el6_7
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.85-2.6.1.3.el6_6
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.85-2.6.1.3.el6_7
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.91-2.6.2.2.el6_7
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.95-2.6.4.0.el6_7
   provider: java-1.7.0-openjdk.x86_64 1:1.7.0.99-2.6.5.0.el6_7
  dependency: puppet-agent >= 1.4.0
   provider: puppet-agent.x86_64 1.4.1-1.el6
  dependency: net-tools
   provider: net-tools.x86_64 1.60-110.el6_2
  dependency: /usr/bin/env
   provider: coreutils.x86_64 8.4-37.el6
   provider: coreutils.x86_64 8.4-37.el6_7.3
  dependency: /bin/sh
   provider: bash.x86_64 4.1.2-33.el6
   provider: bash.x86_64 4.1.2-33.el6_7.1
  dependency: config(puppetserver) = 2.3.1-1.el6
   provider: puppetserver.noarch 2.3.1-1.el6 

If you only need to rebuild for a different environment without touching the spec, rpmbuild --rebuild XXX.src.rpm is all it takes.
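
For this example that is simply the following; the rebuilt rpms land under the default %_topdir layout (e.g. ~/rpmbuild/RPMS) unless overridden:

rpmbuild --rebuild puppetserver-2.3.1-1.el6.src.rpm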

–END

Learning Parquet

Classic Articles

Concepts

  • Row Group
    • Column Chunk
      • Page
        • Definition Levels: To support nested records we need to store the level for which the field is null. This is what the definition level is for: from 0 at the root of the schema up to the maximum level for this column. When a field is defined then all its parents are defined too, but when it is null we need to record the level at which it started being null to be able to reconstruct the record.
        • Repetition Levels: To support repeated fields we need to store when new lists are starting in a column of values. This is what repetition level is for: it is the level at which we have to create a new list for the current value. In other words, the repetition level can be seen as a marker of when to start a new list and at which level.
  • FileMetaData

The definition and repetition levels are optional, based on the schema definition. If the column is not nested (i.e. the path to the column has length 1), we do not encode the repetition levels (it would always have the value 1). For data that is required, the definition levels are skipped (if encoded, it will always have the value of the max definition level).

For example, in the case where the column is non-nested and required, the data in the page is only the encoded values.

An optimized read setup would be: 1GB row groups, 1GB HDFS block size, 1 HDFS block per HDFS file.
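
To look at those structures (row groups, column chunks, pages, and the footer metadata) on an actual file, the parquet-tools jar can print them. A sketch, where the jar version and the data file name are placeholders; the 1GB row group mentioned above corresponds to the writer's parquet.block.size setting:

# print the parquet schema of a file
hadoop jar parquet-tools-1.6.0.jar schema /user/hive/t_ods_access_log2_parquet/000000_0
# print row-group / column-chunk metadata: sizes, encodings, compression
hadoop jar parquet-tools-1.6.0.jar meta /user/hive/t_ods_access_log2_parquet/000000_0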

Converting textfile to parquet

[hadoop@hadoop-master2 ~]$ cd apache-hive-1.2.1-bin/
[hadoop@hadoop-master2 apache-hive-1.2.1-bin]$ bin/hive
hive> CREATE TABLE `t_ods_access_log2_parquet`(   `houseid` string,    `sourceip` string,    `destinationip` string,    `sourceport` string,    `destinationport` string,    `domain` string,    `url` string,    `accesstime` string,    `logid` string,    `sourceipnum` bigint,    `timedetected` string,    `protocol` string,    `duration` string) ROW FORMAT DELIMITED    FIELDS TERMINATED BY '|'  STORED AS PARQUET LOCATION   '/user/hive/t_ods_access_log2_parquet'

The key part is STORED AS PARQUET.

Compression can be set through the mapreduce parameters ( mapreduce.output.fileoutputformat.compress and mapreduce.output.fileoutputformat.compress.codec ), but the recommended way is the parquet.compression property.

Both the reader and the writer obtain the compression codec from CodecConfig.getCodec(), whose code reads the compression setting from the parquet properties as well as from the mapreduce ones.
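
A hedged sketch of setting compression per session in the Hive CLI instead of per table; parquet.compression should take precedence over the generic mapreduce switches when both are present:

hive> set parquet.compression=SNAPPY;
hive> set mapreduce.output.fileoutputformat.compress=true;
hive> set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;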

alter table t_ods_access_log2_parquet SET TBLPROPERTIES ('parquet.compression' = 'SNAPPY' );

create table t_ods_access_log2_parquet_none like t_ods_access_log2_parquet TBLPROPERTIES ('parquet.compression' = 'UNCOMPRESSED' );
create table t_ods_access_log2_parquet_gzip like t_ods_access_log2_parquet TBLPROPERTIES ('parquet.compression' = 'GZIP' );

A plain Hive insert into statement is enough to convert the original textfile data to parquet. I also converted to gzip and uncompressed for comparison:

Format     Compression    Size
textfile   snappy         4.1G
parquet    snappy         3.6G
parquet    uncompressed   7.2G
parquet    gzip           2.2G

Running count(*) on the whole table reads less than 1 MB of input with parquet, wonderfully economical!! (The files are only tens of MB each, so a single file (block) can sit entirely on one machine.)

Format     Engine     Input read
textfile   tez        HDFS_BYTES_READ 4,454,071,542
parquet    tez        HDFS_BYTES_READ 415,870
textfile   sparksql   Input 4.1 GB
parquet    sparksql   Input 384.9 KB

Surprisingly, sparksql is even faster on the textfile table. A big pile of memory and brute force really does work!!

hive> insert into t_ods_access_log2_back select houseid,  sourceip,  destinationip,  sourceport,  destinationport,  domain,  url,  accesstime,  logid,  sourceipnum,  timedetected,  protocol,  duration from t_ods_access_log2 where hour=2016032804 ;
Query ID = hadoop_20160329200414_96f1de35-48c5-4b38-977f-05de8554f388
Total jobs = 1
Launching Job 1 out of 1


Status: Running (Executing on YARN cluster with App id application_1458893800770_3955)

--------------------------------------------------------------------------------
        VERTICES      STATUS  TOTAL  COMPLETED  RUNNING  PENDING  FAILED  KILLED
--------------------------------------------------------------------------------
Map 1 ..........   SUCCEEDED    152        152        0        0       1       0
--------------------------------------------------------------------------------
VERTICES: 01/01  [==========================>>] 100%  ELAPSED TIME: 341.56 s   
--------------------------------------------------------------------------------
Loading data to table default.t_ods_access_log2_back
Table default.t_ods_access_log2_back stats: [numFiles=152, numRows=57688987, totalSize=4454071542, rawDataSize=11018516544]
OK
Time taken: 347.997 seconds
hive> insert into t_ods_access_log2_parquet select houseid,  sourceip,  destinationip,  sourceport,  destinationport,  domain,  url,  accesstime,  logid,  sourceipnum,  timedetected,  protocol,  duration from t_ods_access_log2 where hour=2016032804 ;
Query ID = hadoop_20160329212157_57b66595-5dfc-4fc9-9ad1-398e2b8ade6b
Total jobs = 1
Launching Job 1 out of 1
Tez session was closed. Reopening...
Session re-established.


Status: Running (Executing on YARN cluster with App id application_1458893800770_3992)

--------------------------------------------------------------------------------
        VERTICES      STATUS  TOTAL  COMPLETED  RUNNING  PENDING  FAILED  KILLED
--------------------------------------------------------------------------------
Map 1 ..........   SUCCEEDED    152        152        0        0       0       0
--------------------------------------------------------------------------------
VERTICES: 01/01  [==========================>>] 100%  ELAPSED TIME: 237.28 s   
--------------------------------------------------------------------------------
Loading data to table default.t_ods_access_log2_parquet
Table default.t_ods_access_log2_parquet stats: [numFiles=0, numRows=1305035789, totalSize=0, rawDataSize=16965465257]
OK
Time taken: 260.515 seconds
hive> select count(*) from t_ods_access_log2_back;
Query ID = hadoop_20160329212644_da8e7997-5bcc-41ab-8b63-f1a5919c5a2f
Total jobs = 1
Launching Job 1 out of 1


Status: Running (Executing on YARN cluster with App id application_1458893800770_3992)

--------------------------------------------------------------------------------
        VERTICES      STATUS  TOTAL  COMPLETED  RUNNING  PENDING  FAILED  KILLED
--------------------------------------------------------------------------------
Map 1 ..........   SUCCEEDED    107        107        0        0       0       0
Reducer 2 ......   SUCCEEDED      1          1        0        0       0       0
--------------------------------------------------------------------------------
VERTICES: 02/02  [==========================>>] 100%  ELAPSED TIME: 59.01 s    
--------------------------------------------------------------------------------
OK
57688987
Time taken: 59.768 seconds, Fetched: 1 row(s)
hive> select count(*) from t_ods_access_log2_parquet;
Query ID = hadoop_20160329212813_2fb8dafa-5c9a-40e8-a904-13e7cf865ec6
Total jobs = 1
Launching Job 1 out of 1


Status: Running (Executing on YARN cluster with App id application_1458893800770_3992)

--------------------------------------------------------------------------------
        VERTICES      STATUS  TOTAL  COMPLETED  RUNNING  PENDING  FAILED  KILLED
--------------------------------------------------------------------------------
Map 1 ..........   SUCCEEDED    106        106        0        0       0       0
Reducer 2 ......   SUCCEEDED      1          1        0        0       0       0
--------------------------------------------------------------------------------
VERTICES: 02/02  [==========================>>] 100%  ELAPSED TIME: 45.82 s    
--------------------------------------------------------------------------------
OK
57688987
Time taken: 47.275 seconds, Fetched: 1 row(s)

hive> set hive.execution.engine=spark;
hive> set spark.master=yarn-client;
hive> select count(*) from t_ods_access_log2_back;
Query ID = hadoop_20160329214550_a58d1056-9c91-4bbe-be7d-122ec3efdd8d
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = 3a03d432-83a4-4d5a-a878-c9e52aa94bed

Query Hive on Spark job[0] stages:
0
1

Status: Running (Hive on Spark job[0])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-29 21:46:26,523 Stage-0_0: 0(+114)/152  Stage-1_0: 0/1
2016-03-29 21:46:27,535 Stage-0_0: 0(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:30,563 Stage-0_0: 0(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:33,582 Stage-0_0: 0(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:36,606 Stage-0_0: 0(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:39,624 Stage-0_0: 0(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:41,637 Stage-0_0: 0(+118)/152  Stage-1_0: 0/1
2016-03-29 21:46:42,644 Stage-0_0: 4(+115)/152  Stage-1_0: 0/1
2016-03-29 21:46:43,651 Stage-0_0: 110(+41)/152 Stage-1_0: 0/1
2016-03-29 21:46:44,658 Stage-0_0: 124(+28)/152 Stage-1_0: 0/1
2016-03-29 21:46:45,665 Stage-0_0: 128(+24)/152 Stage-1_0: 0/1
2016-03-29 21:46:46,671 Stage-0_0: 138(+14)/152 Stage-1_0: 0/1
2016-03-29 21:46:47,677 Stage-0_0: 142(+10)/152 Stage-1_0: 0/1
2016-03-29 21:46:48,684 Stage-0_0: 144(+8)/152  Stage-1_0: 0/1
2016-03-29 21:46:49,691 Stage-0_0: 147(+5)/152  Stage-1_0: 0/1
2016-03-29 21:46:50,698 Stage-0_0: 148(+4)/152  Stage-1_0: 0/1
2016-03-29 21:46:51,705 Stage-0_0: 149(+3)/152  Stage-1_0: 0/1
2016-03-29 21:46:52,712 Stage-0_0: 150(+2)/152  Stage-1_0: 0/1
2016-03-29 21:46:55,731 Stage-0_0: 151(+1)/152  Stage-1_0: 0/1
2016-03-29 21:46:58,750 Stage-0_0: 151(+1)/152  Stage-1_0: 0/1
2016-03-29 21:47:01,769 Stage-0_0: 151(+1)/152  Stage-1_0: 0/1
2016-03-29 21:47:02,776 Stage-0_0: 152/152 Finished     Stage-1_0: 0(+1)/1
2016-03-29 21:47:05,793 Stage-0_0: 152/152 Finished     Stage-1_0: 1/1 Finished
Status: Finished successfully in 70.33 seconds
OK
57688987
Time taken: 75.211 seconds, Fetched: 1 row(s)
hive> select count(*) from t_ods_access_log2_back;
Query ID = hadoop_20160329214723_9663eaf7-7014-46b1-b2ca-811ba64fc55c
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = f2dbcd55-b23c-4eb3-9439-8f1c825fbac3

Query Hive on Spark job[1] stages:
2
3

Status: Running (Hive on Spark job[1])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-29 21:47:24,449 Stage-2_0: 0(+122)/152  Stage-3_0: 0/1
2016-03-29 21:47:25,455 Stage-2_0: 96(+56)/152  Stage-3_0: 0/1
2016-03-29 21:47:26,462 Stage-2_0: 123(+29)/152 Stage-3_0: 0/1
2016-03-29 21:47:27,469 Stage-2_0: 128(+24)/152 Stage-3_0: 0/1
2016-03-29 21:47:28,476 Stage-2_0: 132(+20)/152 Stage-3_0: 0/1
2016-03-29 21:47:29,483 Stage-2_0: 137(+15)/152 Stage-3_0: 0/1
2016-03-29 21:47:30,489 Stage-2_0: 145(+7)/152  Stage-3_0: 0/1
2016-03-29 21:47:31,495 Stage-2_0: 146(+6)/152  Stage-3_0: 0/1
2016-03-29 21:47:32,500 Stage-2_0: 150(+2)/152  Stage-3_0: 0/1
2016-03-29 21:47:33,506 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:36,524 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:39,540 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:42,557 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:45,573 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:48,589 Stage-2_0: 152/152 Finished     Stage-3_0: 0(+1)/1
2016-03-29 21:47:49,594 Stage-2_0: 152/152 Finished     Stage-3_0: 1/1 Finished
Status: Finished successfully in 26.15 seconds
OK
57688987
Time taken: 26.392 seconds, Fetched: 1 row(s)
hive> select count(*) from t_ods_access_log2_parquet;
Query ID = hadoop_20160329214758_25084e25-fdaf-4ef8-9c1a-2573515caca6
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = 4360be5c-4188-49c4-a2a7-e5bb80164646

Query Hive on Spark job[2] stages:
5
4

Status: Running (Hive on Spark job[2])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-29 21:47:59,472 Stage-4_0: 0(+63)/65    Stage-5_0: 0/1
2016-03-29 21:48:00,478 Stage-4_0: 1(+62)/65    Stage-5_0: 0/1
2016-03-29 21:48:01,486 Stage-4_0: 49(+14)/65   Stage-5_0: 0/1
2016-03-29 21:48:02,492 Stage-4_0: 51(+14)/65   Stage-5_0: 0/1
2016-03-29 21:48:03,498 Stage-4_0: 57(+8)/65    Stage-5_0: 0/1
2016-03-29 21:48:04,505 Stage-4_0: 62(+3)/65    Stage-5_0: 0/1
2016-03-29 21:48:05,511 Stage-4_0: 63(+2)/65    Stage-5_0: 0/1
2016-03-29 21:48:06,518 Stage-4_0: 65/65 Finished       Stage-5_0: 0(+1)/1
2016-03-29 21:48:09,537 Stage-4_0: 65/65 Finished       Stage-5_0: 0(+1)/1
2016-03-29 21:48:12,556 Stage-4_0: 65/65 Finished       Stage-5_0: 0(+1)/1
2016-03-29 21:48:15,574 Stage-4_0: 65/65 Finished       Stage-5_0: 0(+1)/1
2016-03-29 21:48:18,592 Stage-4_0: 65/65 Finished       Stage-5_0: 0(+1)/1
2016-03-29 21:48:21,608 Stage-4_0: 65/65 Finished       Stage-5_0: 1/1 Finished
Status: Finished successfully in 23.14 seconds
OK
57688987
Time taken: 23.376 seconds, Fetched: 1 row(s)
hive> select count(*) from t_ods_access_log2_parquet;
Query ID = hadoop_20160329214826_173311b1-0083-4e11-9a29-fe13f48bb649
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = c452b02b-c68f-4c68-bc28-cb9748d7dcb2

Query Hive on Spark job[3] stages:
6
7

Status: Running (Hive on Spark job[3])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-29 21:48:27,332 Stage-6_0: 3(+60)/65    Stage-7_0: 0/1
2016-03-29 21:48:28,338 Stage-6_0: 53(+10)/65   Stage-7_0: 0/1
2016-03-29 21:48:29,343 Stage-6_0: 60(+3)/65    Stage-7_0: 0/1
2016-03-29 21:48:30,349 Stage-6_0: 61(+4)/65    Stage-7_0: 0/1
2016-03-29 21:48:31,354 Stage-6_0: 63(+2)/65    Stage-7_0: 0/1
2016-03-29 21:48:32,360 Stage-6_0: 65/65 Finished       Stage-7_0: 0(+1)/1
2016-03-29 21:48:35,377 Stage-6_0: 65/65 Finished       Stage-7_0: 0(+1)/1
2016-03-29 21:48:38,393 Stage-6_0: 65/65 Finished       Stage-7_0: 0(+1)/1
2016-03-29 21:48:40,404 Stage-6_0: 65/65 Finished       Stage-7_0: 1/1 Finished
Status: Finished successfully in 14.08 seconds
OK
57688987
Time taken: 14.306 seconds, Fetched: 1 row(s)


[hadoop@hadoop-master2 spark-1.6.0-bin-2.6.3]$ bin/spark-sql --master yarn-client --hiveconf hive.execution.engine=mr 
         > select count(*) from t_ods_access_log2_parquet;
57688987
16/03/29 22:19:51 INFO CliDriver: Time taken: 21.82 seconds, Fetched 1 row(s)

         > select count(*) from t_ods_access_log2_back;
57688987
16/03/29 22:20:44 INFO CliDriver: Time taken: 6.634 seconds, Fetched 1 row(s)

–END

Limit on Sparksql and Hive

The previous post mentioned that sparksql returns early on a limit query and does not have to scan all the data. Hive grinds through everything, while sparksql probes again and again over a progressively larger amount of data (see the jobs with 1, 2, 7, and 25 output partitions in the log below). sparksql can afford to do this, since the data it has computed stays in memory.

The logs are recorded below:

hive1.2.1-on-spark1.3.1

hive> select houseid,  sourceip,  destinationip,  sourceport,  destinationport,  domain,  url,  accesstime,  logid,  sourceipnum,  timedetected,  protocol,  duration from t_ods_access_log2 where hour=2016032804 and sourceip='118.112.188.17' limit 10;
Query ID = hadoop_20160329151420_25fe9497-e223-4f48-980e-e7fe859848ce
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = 9036c8d7-62b6-4b9a-b6d3-2d8b5eed6bf9

Query Hive on Spark job[2] stages:
3

Status: Running (Hive on Spark job[2])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-29 15:14:22,053 Stage-3_0: 0(+160)/942
2016-03-29 15:14:23,059 Stage-3_0: 47(+160)/942
2016-03-29 15:14:24,064 Stage-3_0: 131(+160)/942
2016-03-29 15:14:25,069 Stage-3_0: 266(+160)/942
2016-03-29 15:14:26,075 Stage-3_0: 382(+160)/942
2016-03-29 15:14:27,080 Stage-3_0: 497(+152)/942
2016-03-29 15:14:28,085 Stage-3_0: 607(+142)/942
2016-03-29 15:14:29,090 Stage-3_0: 714(+125)/942
2016-03-29 15:14:30,094 Stage-3_0: 794(+91)/942
2016-03-29 15:14:31,099 Stage-3_0: 846(+61)/942
2016-03-29 15:14:32,103 Stage-3_0: 868(+47)/942
2016-03-29 15:14:33,107 Stage-3_0: 886(+35)/942
2016-03-29 15:14:34,112 Stage-3_0: 895(+26)/942
2016-03-29 15:14:35,116 Stage-3_0: 902(+21)/942
2016-03-29 15:14:36,120 Stage-3_0: 904(+19)/942
2016-03-29 15:14:37,124 Stage-3_0: 906(+17)/942
2016-03-29 15:14:38,128 Stage-3_0: 910(+15)/942
2016-03-29 15:14:39,132 Stage-3_0: 914(+13)/942
2016-03-29 15:14:40,137 Stage-3_0: 920(+9)/942
2016-03-29 15:14:41,141 Stage-3_0: 921(+8)/942
2016-03-29 15:14:44,155 Stage-3_0: 928(+14)/942
2016-03-29 15:14:45,159 Stage-3_0: 934(+8)/942
2016-03-29 15:14:46,164 Stage-3_0: 936(+6)/942
2016-03-29 15:14:47,169 Stage-3_0: 937(+5)/942
2016-03-29 15:14:50,180 Stage-3_0: 938(+4)/942
2016-03-29 15:14:52,188 Stage-3_0: 939(+3)/942
2016-03-29 15:14:54,196 Stage-3_0: 941(+1)/942
2016-03-29 15:14:57,206 Stage-3_0: 941(+1)/942
2016-03-29 15:15:00,215 Stage-3_0: 942/942 Finished
Status: Finished successfully in 39.17 seconds

sparksql1.6.0

spark-sql> select houseid,  sourceip,  destinationip,  sourceport,  destinationport,  domain,  url,  accesstime,  logid,  sourceipnum,  timedetected,  protocol,  duration from t_ods_access_log2 where hour=2016032804 and sourceip='118.112.188.17' limit 10;
16/03/29 15:15:16 INFO parse.ParseDriver: Parsing command: select houseid,  sourceip,  destinationip,  sourceport,  destinationport,  domain,  url,  accesstime,  logid,  sourceipnum,  timedetected,  protocol,  duration from t_ods_access_log2 where hour=2016032804 and sourceip='118.112.188.17' limit 10
16/03/29 15:15:16 INFO parse.ParseDriver: Parse Completed
16/03/29 15:15:16 INFO metastore.HiveMetaStore: 0: get_table : db=default tbl=t_ods_access_log2
16/03/29 15:15:16 INFO HiveMetaStore.audit: ugi=hadoop  ip=unknown-ip-addr      cmd=get_table : db=default tbl=t_ods_access_log2
16/03/29 15:15:17 INFO storage.MemoryStore: Block broadcast_6 stored as values in memory (estimated size 543.3 KB, free 9.7 MB)
16/03/29 15:15:17 INFO storage.MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 44.1 KB, free 9.8 MB)
16/03/29 15:15:17 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on 192.168.32.12:51590 (size: 44.1 KB, free: 21.3 GB)
16/03/29 15:15:17 INFO spark.SparkContext: Created broadcast 6 from processCmd at CliDriver.java:376
16/03/29 15:15:17 INFO metastore.HiveMetaStore: 0: get_partitions : db=default tbl=t_ods_access_log2
16/03/29 15:15:17 INFO HiveMetaStore.audit: ugi=hadoop  ip=unknown-ip-addr      cmd=get_partitions : db=default tbl=t_ods_access_log2
16/03/29 15:15:18 INFO mapred.FileInputFormat: Total input paths to process : 942
16/03/29 15:15:18 INFO spark.SparkContext: Starting job: processCmd at CliDriver.java:376
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Got job 4 (processCmd at CliDriver.java:376) with 1 output partitions
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Final stage: ResultStage 5 (processCmd at CliDriver.java:376)
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Missing parents: List()
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Submitting ResultStage 5 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376), which has no missing parents
16/03/29 15:15:18 INFO storage.MemoryStore: Block broadcast_7 stored as values in memory (estimated size 3.9 MB, free 13.7 MB)
16/03/29 15:15:18 INFO storage.MemoryStore: Block broadcast_7_piece0 stored as bytes in memory (estimated size 318.8 KB, free 14.0 MB)
16/03/29 15:15:18 INFO storage.BlockManagerInfo: Added broadcast_7_piece0 in memory on 192.168.32.12:51590 (size: 318.8 KB, free: 21.3 GB)
16/03/29 15:15:18 INFO spark.SparkContext: Created broadcast 7 from broadcast at DAGScheduler.scala:1006
16/03/29 15:15:18 INFO scheduler.DAGScheduler: Submitting 1 missing tasks from ResultStage 5 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376)
16/03/29 15:15:18 INFO cluster.YarnScheduler: Adding task set 5.0 with 1 tasks
16/03/29 15:15:18 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 5.0 (TID 1260, hadoop-slaver135, partition 0,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:19 INFO storage.BlockManagerInfo: Added broadcast_7_piece0 in memory on hadoop-slaver135:59376 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:20 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver135:59376 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:21 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 5.0 (TID 1260) in 3273 ms on hadoop-slaver135 (1/1)
16/03/29 15:15:21 INFO cluster.YarnScheduler: Removed TaskSet 5.0, whose tasks have all completed, from pool 
16/03/29 15:15:21 INFO scheduler.DAGScheduler: ResultStage 5 (processCmd at CliDriver.java:376) finished in 3.276 s
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Job 4 finished: processCmd at CliDriver.java:376, took 3.475462 s
16/03/29 15:15:21 INFO scheduler.StatsReportListener: Finished stage: org.apache.spark.scheduler.StageInfo@57e08525
16/03/29 15:15:21 INFO spark.SparkContext: Starting job: processCmd at CliDriver.java:376
16/03/29 15:15:21 INFO scheduler.StatsReportListener: task runtime:(count: 1, mean: 3273.000000, stdev: 0.000000, max: 3273.000000, min: 3273.000000)
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   3.3 s   3.3 s   3.3 s   3.3 s   3.3 s   3.3 s   3.3 s   3.3 s   3.3 s
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Got job 5 (processCmd at CliDriver.java:376) with 2 output partitions
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Final stage: ResultStage 6 (processCmd at CliDriver.java:376)
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/03/29 15:15:21 INFO scheduler.StatsReportListener: task result size:(count: 1, mean: 3763.000000, stdev: 0.000000, max: 3763.000000, min: 3763.000000)
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   3.7 KB  3.7 KB  3.7 KB  3.7 KB  3.7 KB  3.7 KB  3.7 KB  3.7 KB  3.7 KB
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Missing parents: List()
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Submitting ResultStage 6 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376), which has no missing parents
16/03/29 15:15:21 INFO scheduler.StatsReportListener: executor (non-fetch) time pct: (count: 1, mean: 51.879010, stdev: 0.000000, max: 51.879010, min: 51.879010)
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   52 %    52 %    52 %    52 %    52 %    52 %    52 %    52 %    52 %
16/03/29 15:15:21 INFO scheduler.StatsReportListener: other time pct: (count: 1, mean: 48.120990, stdev: 0.000000, max: 48.120990, min: 48.120990)
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:21 INFO scheduler.StatsReportListener:   48 %    48 %    48 %    48 %    48 %    48 %    48 %    48 %    48 %
16/03/29 15:15:21 INFO storage.MemoryStore: Block broadcast_8 stored as values in memory (estimated size 3.9 MB, free 17.9 MB)
16/03/29 15:15:21 INFO storage.MemoryStore: Block broadcast_8_piece0 stored as bytes in memory (estimated size 318.8 KB, free 18.2 MB)
16/03/29 15:15:21 INFO storage.BlockManagerInfo: Added broadcast_8_piece0 in memory on 192.168.32.12:51590 (size: 318.8 KB, free: 21.3 GB)
16/03/29 15:15:21 INFO spark.SparkContext: Created broadcast 8 from broadcast at DAGScheduler.scala:1006
16/03/29 15:15:21 INFO scheduler.DAGScheduler: Submitting 2 missing tasks from ResultStage 6 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376)
16/03/29 15:15:21 INFO cluster.YarnScheduler: Adding task set 6.0 with 2 tasks
16/03/29 15:15:21 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 6.0 (TID 1261, hadoop-slaver67, partition 1,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:21 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 6.0 (TID 1262, hadoop-slaver121, partition 2,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:21 INFO storage.BlockManagerInfo: Added broadcast_8_piece0 in memory on hadoop-slaver67:49600 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:22 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver67:49600 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:22 INFO storage.BlockManagerInfo: Added broadcast_8_piece0 in memory on hadoop-slaver121:57614 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:22 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver121:57614 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:22 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 6.0 (TID 1261) in 930 ms on hadoop-slaver67 (1/2)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 6.0 (TID 1262) in 1207 ms on hadoop-slaver121 (2/2)
16/03/29 15:15:23 INFO cluster.YarnScheduler: Removed TaskSet 6.0, whose tasks have all completed, from pool 
16/03/29 15:15:23 INFO scheduler.DAGScheduler: ResultStage 6 (processCmd at CliDriver.java:376) finished in 1.210 s
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Job 5 finished: processCmd at CliDriver.java:376, took 1.378783 s
16/03/29 15:15:23 INFO scheduler.StatsReportListener: Finished stage: org.apache.spark.scheduler.StageInfo@573e5329
16/03/29 15:15:23 INFO spark.SparkContext: Starting job: processCmd at CliDriver.java:376
16/03/29 15:15:23 INFO scheduler.StatsReportListener: task runtime:(count: 2, mean: 1068.500000, stdev: 138.500000, max: 1207.000000, min: 930.000000)
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   930.0 ms        930.0 ms        930.0 ms        930.0 ms        1.2 s   1.2 s   1.2 s   1.2 s   1.2 s
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Got job 6 (processCmd at CliDriver.java:376) with 7 output partitions
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Final stage: ResultStage 7 (processCmd at CliDriver.java:376)
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/03/29 15:15:23 INFO scheduler.StatsReportListener: task result size:(count: 2, mean: 2267.500000, stdev: 0.500000, max: 2268.000000, min: 2267.000000)
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Missing parents: List()
16/03/29 15:15:23 INFO scheduler.StatsReportListener: executor (non-fetch) time pct: (count: 2, mean: 73.649411, stdev: 11.511880, max: 85.161290, min: 62.137531)
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   62 %    62 %    62 %    62 %    85 %    85 %    85 %    85 %    85 %
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Submitting ResultStage 7 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376), which has no missing parents
16/03/29 15:15:23 INFO scheduler.StatsReportListener: other time pct: (count: 2, mean: 26.350589, stdev: 11.511880, max: 37.862469, min: 14.838710)
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:23 INFO scheduler.StatsReportListener:   15 %    15 %    15 %    15 %    38 %    38 %    38 %    38 %    38 %
16/03/29 15:15:23 INFO storage.MemoryStore: Block broadcast_9 stored as values in memory (estimated size 3.9 MB, free 22.1 MB)
16/03/29 15:15:23 INFO storage.MemoryStore: Block broadcast_9_piece0 stored as bytes in memory (estimated size 318.8 KB, free 22.4 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on 192.168.32.12:51590 (size: 318.8 KB, free: 21.3 GB)
16/03/29 15:15:23 INFO spark.SparkContext: Created broadcast 9 from broadcast at DAGScheduler.scala:1006
16/03/29 15:15:23 INFO scheduler.DAGScheduler: Submitting 7 missing tasks from ResultStage 7 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376)
16/03/29 15:15:23 INFO cluster.YarnScheduler: Adding task set 7.0 with 7 tasks
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 6.0 in stage 7.0 (TID 1263, hadoop-slaver158, partition 9,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 7.0 (TID 1264, hadoop-slaver82, partition 3,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 5.0 in stage 7.0 (TID 1265, hadoop-slaver68, partition 8,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 7.0 (TID 1266, hadoop-slaver120, partition 4,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 2.0 in stage 7.0 (TID 1267, hadoop-slaver14, partition 5,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 4.0 in stage 7.0 (TID 1268, hadoop-slaver137, partition 7,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO scheduler.TaskSetManager: Starting task 3.0 in stage 7.0 (TID 1269, hadoop-slaver70, partition 6,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver68:45281 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver70:34080 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver137:45760 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver82:36935 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver158:39852 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver14:40126 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_9_piece0 in memory on hadoop-slaver120:46667 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver68:45281 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver120:46667 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver70:34080 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver82:36935 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver14:40126 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver137:45760 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:23 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver158:39852 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 7.0 (TID 1266) in 780 ms on hadoop-slaver120 (1/7)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 7.0 (TID 1264) in 943 ms on hadoop-slaver82 (2/7)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 5.0 in stage 7.0 (TID 1265) in 999 ms on hadoop-slaver68 (3/7)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 3.0 in stage 7.0 (TID 1269) in 1047 ms on hadoop-slaver70 (4/7)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 4.0 in stage 7.0 (TID 1268) in 1123 ms on hadoop-slaver137 (5/7)
16/03/29 15:15:24 INFO scheduler.TaskSetManager: Finished task 2.0 in stage 7.0 (TID 1267) in 1413 ms on hadoop-slaver14 (6/7)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Finished task 6.0 in stage 7.0 (TID 1263) in 2229 ms on hadoop-slaver158 (7/7)
16/03/29 15:15:25 INFO cluster.YarnScheduler: Removed TaskSet 7.0, whose tasks have all completed, from pool 
16/03/29 15:15:25 INFO scheduler.DAGScheduler: ResultStage 7 (processCmd at CliDriver.java:376) finished in 2.231 s
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Job 6 finished: processCmd at CliDriver.java:376, took 2.399044 s
16/03/29 15:15:25 INFO scheduler.StatsReportListener: Finished stage: org.apache.spark.scheduler.StageInfo@5210a024
16/03/29 15:15:25 INFO spark.SparkContext: Starting job: processCmd at CliDriver.java:376
16/03/29 15:15:25 INFO scheduler.StatsReportListener: task runtime:(count: 7, mean: 1219.142857, stdev: 449.417537, max: 2229.000000, min: 780.000000)
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   780.0 ms        780.0 ms        780.0 ms        943.0 ms        1.0 s   1.4 s   2.2 s   2.2 s   2.2 s
16/03/29 15:15:25 INFO scheduler.StatsReportListener: task result size:(count: 7, mean: 2267.428571, stdev: 0.494872, max: 2268.000000, min: 2267.000000)
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Got job 7 (processCmd at CliDriver.java:376) with 25 output partitions
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Final stage: ResultStage 8 (processCmd at CliDriver.java:376)
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Parents of final stage: List()
16/03/29 15:15:25 INFO scheduler.StatsReportListener: executor (non-fetch) time pct: (count: 7, mean: 83.082955, stdev: 4.773503, max: 92.418125, min: 77.114871)
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   77 %    77 %    77 %    78 %    83 %    86 %    92 %    92 %    92 %
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Missing parents: List()
16/03/29 15:15:25 INFO scheduler.StatsReportListener: other time pct: (count: 7, mean: 16.917045, stdev: 4.773503, max: 22.885129, min: 7.581875)
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Submitting ResultStage 8 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376), which has no missing parents
16/03/29 15:15:25 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:25 INFO scheduler.StatsReportListener:    8 %     8 %     8 %    14 %    17 %    22 %    23 %    23 %    23 %
16/03/29 15:15:25 INFO storage.MemoryStore: Block broadcast_10 stored as values in memory (estimated size 3.9 MB, free 26.3 MB)
16/03/29 15:15:25 INFO storage.MemoryStore: Block broadcast_10_piece0 stored as bytes in memory (estimated size 318.8 KB, free 26.6 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on 192.168.32.12:51590 (size: 318.8 KB, free: 21.3 GB)
16/03/29 15:15:25 INFO spark.SparkContext: Created broadcast 10 from broadcast at DAGScheduler.scala:1006
16/03/29 15:15:25 INFO scheduler.DAGScheduler: Submitting 25 missing tasks from ResultStage 8 (MapPartitionsRDD[18] at processCmd at CliDriver.java:376)
16/03/29 15:15:25 INFO cluster.YarnScheduler: Adding task set 8.0 with 25 tasks
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 19.0 in stage 8.0 (TID 1270, hadoop-slaver61, partition 29,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 2.0 in stage 8.0 (TID 1271, hadoop-slaver100, partition 12,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 9.0 in stage 8.0 (TID 1272, hadoop-slaver34, partition 19,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 10.0 in stage 8.0 (TID 1273, hadoop-slaver76, partition 20,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 14.0 in stage 8.0 (TID 1274, hadoop-slaver84, partition 24,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 17.0 in stage 8.0 (TID 1275, hadoop-slaver96, partition 27,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 4.0 in stage 8.0 (TID 1276, hadoop-slaver38, partition 14,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 13.0 in stage 8.0 (TID 1277, hadoop-slaver11, partition 23,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 15.0 in stage 8.0 (TID 1278, hadoop-slaver98, partition 25,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 1.0 in stage 8.0 (TID 1279, hadoop-slaver136, partition 11,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 7.0 in stage 8.0 (TID 1280, hadoop-slaver44, partition 17,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 20.0 in stage 8.0 (TID 1281, hadoop-slaver120, partition 30,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 11.0 in stage 8.0 (TID 1282, hadoop-slaver141, partition 21,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 23.0 in stage 8.0 (TID 1283, hadoop-slaver82, partition 33,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 24.0 in stage 8.0 (TID 1284, hadoop-slaver159, partition 34,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 18.0 in stage 8.0 (TID 1285, hadoop-slaver15, partition 28,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 6.0 in stage 8.0 (TID 1286, hadoop-slaver1, partition 16,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 8.0 in stage 8.0 (TID 1287, hadoop-slaver145, partition 18,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 22.0 in stage 8.0 (TID 1288, hadoop-slaver142, partition 32,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 16.0 in stage 8.0 (TID 1289, hadoop-slaver31, partition 26,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 5.0 in stage 8.0 (TID 1290, hadoop-slaver75, partition 15,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 12.0 in stage 8.0 (TID 1291, hadoop-slaver97, partition 22,NODE_LOCAL, 2354 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 21.0 in stage 8.0 (TID 1292, hadoop-slaver149, partition 31,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 0.0 in stage 8.0 (TID 1293, hadoop-slaver163, partition 10,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO scheduler.TaskSetManager: Starting task 3.0 in stage 8.0 (TID 1294, hadoop-slaver91, partition 13,NODE_LOCAL, 2355 bytes)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver34:54432 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver120:46667 (size: 318.8 KB, free: 510.0 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver15:58396 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver82:36935 (size: 318.8 KB, free: 510.0 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver31:37685 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver1:38813 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver100:56851 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver61:37705 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver98:60144 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver38:57228 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver76:40021 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver44:37682 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver149:59628 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver159:40160 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver11:44070 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver91:47206 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver75:50788 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver97:54552 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver34:54432 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver75:50788 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver38:57228 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver100:56851 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver98:60144 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver149:59628 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver44:37682 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver97:54552 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver1:38813 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver91:47206 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver76:40021 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver31:37685 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver159:40160 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver15:58396 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver145:37716 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver61:37705 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver141:60941 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver136:33234 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:25 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver96:53017 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver96:53017 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver141:60941 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver163:50662 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver145:37716 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver84:34548 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 20.0 in stage 8.0 (TID 1281) in 762 ms on hadoop-slaver120 (1/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 15.0 in stage 8.0 (TID 1278) in 873 ms on hadoop-slaver98 (2/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 2.0 in stage 8.0 (TID 1271) in 892 ms on hadoop-slaver100 (3/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 12.0 in stage 8.0 (TID 1291) in 911 ms on hadoop-slaver97 (4/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 5.0 in stage 8.0 (TID 1290) in 914 ms on hadoop-slaver75 (5/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 4.0 in stage 8.0 (TID 1276) in 938 ms on hadoop-slaver38 (6/25)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver163:50662 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 7.0 in stage 8.0 (TID 1280) in 955 ms on hadoop-slaver44 (7/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 10.0 in stage 8.0 (TID 1273) in 963 ms on hadoop-slaver76 (8/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 6.0 in stage 8.0 (TID 1286) in 974 ms on hadoop-slaver1 (9/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 9.0 in stage 8.0 (TID 1272) in 1019 ms on hadoop-slaver34 (10/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 11.0 in stage 8.0 (TID 1282) in 1186 ms on hadoop-slaver141 (11/25)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 23.0 in stage 8.0 (TID 1283) in 1187 ms on hadoop-slaver82 (12/25)
16/03/29 15:15:26 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver11:44070 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:26 INFO scheduler.TaskSetManager: Finished task 8.0 in stage 8.0 (TID 1287) in 1260 ms on hadoop-slaver145 (13/25)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 21.0 in stage 8.0 (TID 1292) in 1349 ms on hadoop-slaver149 (14/25)
16/03/29 15:15:27 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver136:33234 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:27 INFO storage.BlockManagerInfo: Added broadcast_10_piece0 in memory on hadoop-slaver142:59911 (size: 318.8 KB, free: 510.3 MB)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 19.0 in stage 8.0 (TID 1270) in 1569 ms on hadoop-slaver61 (15/25)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 8.0 (TID 1293) in 1598 ms on hadoop-slaver163 (16/25)
16/03/29 15:15:27 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver84:34548 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:27 INFO storage.BlockManagerInfo: Added broadcast_6_piece0 in memory on hadoop-slaver142:59911 (size: 44.1 KB, free: 510.3 MB)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 13.0 in stage 8.0 (TID 1277) in 1958 ms on hadoop-slaver11 (17/25)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 3.0 in stage 8.0 (TID 1294) in 2018 ms on hadoop-slaver91 (18/25)
16/03/29 15:15:27 INFO scheduler.TaskSetManager: Finished task 14.0 in stage 8.0 (TID 1274) in 2267 ms on hadoop-slaver84 (19/25)
16/03/29 15:15:28 INFO scheduler.TaskSetManager: Finished task 17.0 in stage 8.0 (TID 1275) in 2717 ms on hadoop-slaver96 (20/25)
16/03/29 15:15:28 INFO scheduler.TaskSetManager: Finished task 16.0 in stage 8.0 (TID 1289) in 2733 ms on hadoop-slaver31 (21/25)
16/03/29 15:15:28 INFO scheduler.TaskSetManager: Finished task 18.0 in stage 8.0 (TID 1285) in 2864 ms on hadoop-slaver15 (22/25)
16/03/29 15:15:28 INFO scheduler.TaskSetManager: Finished task 1.0 in stage 8.0 (TID 1279) in 3129 ms on hadoop-slaver136 (23/25)
16/03/29 15:15:29 INFO scheduler.TaskSetManager: Finished task 22.0 in stage 8.0 (TID 1288) in 3308 ms on hadoop-slaver142 (24/25)
16/03/29 15:15:31 INFO scheduler.TaskSetManager: Finished task 24.0 in stage 8.0 (TID 1284) in 5445 ms on hadoop-slaver159 (25/25)
16/03/29 15:15:31 INFO cluster.YarnScheduler: Removed TaskSet 8.0, whose tasks have all completed, from pool 
16/03/29 15:15:31 INFO scheduler.DAGScheduler: ResultStage 8 (processCmd at CliDriver.java:376) finished in 5.448 s
16/03/29 15:15:31 INFO scheduler.DAGScheduler: Job 7 finished: processCmd at CliDriver.java:376, took 5.621305 s
16/03/29 15:15:31 INFO scheduler.StatsReportListener: Finished stage: org.apache.spark.scheduler.StageInfo@1251c1a
16/03/29 15:15:31 INFO scheduler.StatsReportListener: task runtime:(count: 25, mean: 1751.560000, stdev: 1086.831729, max: 5445.000000, min: 762.000000)
16/03/29 15:15:31 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:31 INFO scheduler.StatsReportListener:   762.0 ms        873.0 ms        892.0 ms        955.0 ms        1.3 s   2.3 s   3.1 s   3.3 s   5.4 s
16/03/29 15:15:31 INFO scheduler.StatsReportListener: task result size:(count: 25, mean: 2501.840000, stdev: 410.074401, max: 3304.000000, min: 2266.000000)
16/03/29 15:15:31 INFO scheduler.StatsReportListener:   0%      5%      10%     25%     50%     75%     90%     95%     100%
16/03/29 15:15:31 INFO scheduler.StatsReportListener:   2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.2 KB  2.6 KB  3.2 KB  3.2 KB  3.2 KB

Altogether it ramped up in 4 rounds: 1 -> 2 -> 7 -> 25.

–END

Hive on Spark

First read the official resource, Hive on Spark: Getting Started. The documentation is trustworthy and reliable, but with one precondition: the Spark version must be the one specified in hive/pom.xml.

Recompile Spark (with Hive and Hadoop stripped from the assembly jar)

Note: hive-1.2.1 is paired with spark-1.3.1 here!!!

[hadoop@cu2 spark-1.3.1]$ ./make-distribution.sh --name "hadoop2.6.3-without-hive" --tgz --mvn "$(which mvn)" -Pyarn,hadoop-provided,hadoop-2.6,parquet-provided -Dhadoop.version=2.6.3 -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -DskipTests

Copy the packaged spark-1.3.1-bin-hadoop2.6.3-without-hive.tgz to the server, unpack it, and create a symlink named spark (or set the SPARK_HOME environment variable). Hive spares no effort here and tracks down every jar it wants by whatever means it can ( sparkHome=$(readlink -f $bin/../../spark) ).

[hadoop@hadoop-master2 ~]$ ln -s spark-1.3.1-bin-hadoop2.6.3-without-hive spark

# Upload the assembly jar to HDFS so that each job launch avoids shipping a few-hundred-MB file; the spark.yarn.jar setting below will point at it
[hadoop@hadoop-master2 ~]$ cd spark/lib/
[hadoop@hadoop-master2 lib]$ hadoop fs -put spark-assembly-1.3.1-hadoop2.6.3.jar /spark/

With the symlink in place, it looks like this:

[hadoop@hadoop-master2 ~]$ ll | grep -E "hive|spark"
drwxrwxr-x   9 hadoop hadoop 4096 1月  14 08:08 apache-hive-1.2.1-bin
lrwxrwxrwx   1 hadoop hadoop   21 1月  14 08:07 hive -> apache-hive-1.2.1-bin
lrwxrwxrwx   1 hadoop hadoop   40 3月  28 16:38 spark -> spark-1.3.1-bin-hadoop2.6.3-without-hive
drwxrwxr-x  10 hadoop hadoop 4096 3月  28 16:31 spark-1.3.1-bin-hadoop2.6.3-without-hive
drwxrwxr-x  12 hadoop hadoop 4096 3月  25 16:18 spark-1.6.0-bin-2.6.3
drwxrwxr-x  11 hadoop hadoop 4096 3月  28 11:15 spark-1.6.0-bin-hadoop2-without-hive

The spark-1.6.0 directories here were a hard lesson! Remember: always, always use the Spark version given in hive/pom.xml!!!

Modify the Hive configuration

Because Spark loads a large number of classes, the PermGen size needs to be increased.

[hadoop@hadoop-master2 ~]$ less ~/hive/conf/hive-env.sh
export HADOOP_OPTS="$HADOOP_OPTS -XX:MaxPermSize=256m -Dhive.home=${HIVE_HOME} "

Add a spark-defaults.conf file under the conf directory to hold the Spark settings. For dynamic resource allocation see: dynamic-resource-allocation

[hadoop@hadoop-master2 conf]$ cat spark-defaults.conf 
spark.yarn.jar    hdfs:///spark/spark-assembly-1.3.1-hadoop2.6.3.jar

spark.dynamicAllocation.enabled    true
spark.shuffle.service.enabled      true
spark.dynamicAllocation.executorIdleTimeout    600
spark.dynamicAllocation.minExecutors    160 
spark.dynamicAllocation.maxExecutors    1800
spark.dynamicAllocation.schedulerBacklogTimeout   5

spark.driver.memory    10g
spark.driver.maxResultSize   0

spark.eventLog.enabled  true
spark.eventLog.compress  true
spark.eventLog.dir    hdfs:///spark-eventlogs
spark.yarn.historyServer.address hadoop-master2:18080


spark.serializer        org.apache.spark.serializer.KryoSerializer
spark.kryoserializer.buffer.max    512m
  • minExecutors is best kept close to the number of datanodes, so that every node gets an executor and computation can stay data-local.
  • dynamicAllocation needs cooperation from YARN; see the previous article for the details, or go straight to the official docs.
  • eventLog is needed to view job history: once configured, every job's information is stored under the eventLog.dir path, and the history is available on port 18080. (The directory itself is worth pre-creating; see the sketch right after this list.)
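Spark's event logging typically refuses to start if the target directory does not already exist, so it is worth creating it up front; a minimal sketch, matching the eventLog.dir above:

[hadoop@hadoop-master2 ~]$ hadoop fs -mkdir -p /spark-eventlogs
[hadoop@hadoop-master2 ~]$ hadoop fs -ls / | grep spark-eventlogs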

Running it

spark.master defaults to yarn-cluster; here we first run locally (local) to see the effect, and then switch to yarn-cluster/yarn-client (yarn-client is recommended: in yarn-cluster mode the ApplicationMaster is also the Driver, which makes memory harder to control and the logs awkward to read).

[hadoop@hadoop-master2 hive]$ hive --hiveconf hive.execution.engine=spark 

hive> set spark.master=local;
hive> select count(*) from t_house_info ;
Query ID = hadoop_20160328163952_93dafddc-c8b1-4bc9-b851-5e51f6d26fa8
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = 0

Query Hive on Spark job[0] stages:
0
1

Status: Running (Hive on Spark job[0])
Job Progress Format
CurrentTime StageId_StageAttemptId: SucceededTasksCount(+RunningTasksCount-FailedTasksCount)/TotalTasksCount [StageCost]
2016-03-28 16:40:02,077 Stage-0_0: 0(+1)/1      Stage-1_0: 0/1
2016-03-28 16:40:03,078 Stage-0_0: 1/1 Finished Stage-1_0: 1/1 Finished
Status: Finished successfully in 2.01 seconds
OK
1
Time taken: 10.169 seconds, Fetched: 1 row(s)
hive> 

Looking back at it, this is actually quite simple and very close to the official documentation.

Note: Hive's log level can be configured through hive-log4j.properties.
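For instance, the root logger can be raised to DEBUG either in the file or on the command line; a minimal sketch, assuming the stock hive-log4j.properties that ships with hive-1.2.1:

[hadoop@hadoop-master2 ~]$ grep '^hive.root.logger' ~/hive/conf/hive-log4j.properties
hive.root.logger=INFO,DRFA
# temporarily override it for one session instead of editing the file:
[hadoop@hadoop-master2 ~]$ hive --hiveconf hive.root.logger=DEBUG,console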

One problem: regardless of yarn-cluster or yarn-client (hive-1.2.1 on spark-1.3.1), once the application is forcibly killed, subsequent queries fail. Presumably the application was killed but the session is still alive! (A possible workaround is sketched after the log below.)

[hadoop@file1 ~]$ yarn application -kill application_1460379750886_0012
16/04/13 08:47:17 INFO client.RMProxy: Connecting to ResourceManager at file1/192.168.102.6:8032
Killing application application_1460379750886_0012
16/04/13 08:47:18 INFO impl.YarnClientImpl: Killed application application_1460379750886_0012

    > select count(*) from t_info where edate=20160413;
Query ID = hadoop_20160413084736_ac8f88bb-5ee1-4941-9745-f4a8a504f2f3
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Spark Job = eb7e038a-2db0-45d7-9b0d-1e55d354e5e9
Status: Failed
FAILED: Execution Error, return code 3 from org.apache.hadoop.hive.ql.exec.spark.SparkTask
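
If the stale session really is the cause, the simplest thing to try is starting a fresh Hive session, which creates a new Spark client and hence a new YARN application; a sketch, not a verified fix for every failure mode:

hive> quit;
[hadoop@hadoop-master2 hive]$ hive --hiveconf hive.execution.engine=spark
hive> select count(*) from t_info where edate=20160413;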

Pitfalls, pitfalls, pitfalls

When I first started I paid no attention to the Spark version and went straight for spark-1.6.0. Nothing worked at all, and the hive.log showed nothing useful. Only after reading the Hive on Spark lead's reply on the mailing list at http://markmail.org/message/reingwn556e7e37y and setting spark.master=local to run locally did a little useful error information appear.

hive> set hive.execution.engine=spark;
hive> select count(*) from t_ods_access_log2 where day=20160327;
Query ID = hadoop_20160328083028_a9fb9860-38dc-4288-8415-b5b2b88f920a
Total jobs = 1
Launching Job 1 out of 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Failed to execute spark task, with exception 'org.apache.hadoop.hive.ql.metadata.HiveException(Failed to create spark client.)'
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.spark.SparkTask

There is not a shred of useful information in that log!

Set the log level to debug (hive-log4j.properties) and set spark.master=local; to run locally. Running again, the log shows:

2016-03-28 15:13:52,549 DEBUG internal.PlatformDependent (Slf4JLogger.java:debug(71)) - Javassist: unavailable
2016-03-28 15:13:52,549 DEBUG internal.PlatformDependent (Slf4JLogger.java:debug(71)) - You don't have Javassist in your class path or you don't have enough permission to load dynamically generated classes.  Please check the configuration for better performance.

2016-03-28 15:14:56,594 DEBUG storage.BlockManager (Logging.scala:logDebug(62)) - Putting block broadcast_0_piece0 without replication took  8 ms
2016-03-28 15:14:56,597 ERROR util.Utils (Logging.scala:logError(95)) - uncaught error in thread SparkListenerBus, stopping SparkContext
java.lang.AbstractMethodError
        at org.apache.spark.scheduler.SparkListenerBus$class.onPostEvent(SparkListenerBus.scala:62)
        at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
        at org.apache.spark.scheduler.LiveListenerBus.onPostEvent(LiveListenerBus.scala:31)
        at org.apache.spark.util.ListenerBus$class.postToAll(ListenerBus.scala:55)
        at org.apache.spark.util.AsynchronousListenerBus.postToAll(AsynchronousListenerBus.scala:37)
        at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(AsynchronousListenerBus.scala:80)
        at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
        at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(AsynchronousListenerBus.scala:65)
        at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
        at org.apache.spark.util.AsynchronousListenerBus$$anon$1$$anonfun$run$1.apply$mcV$sp(AsynchronousListenerBus.scala:64)
        at org.apache.spark.util.Utils$.tryOrStopSparkContext(Utils.scala:1180)
        at org.apache.spark.util.AsynchronousListenerBus$$anon$1.run(AsynchronousListenerBus.scala:63)

This is an error from invoking an abstract method. Checking hive-1.2.1's SparkListener implementation, JobMetricsListener, it indeed has no implementation of the onBlockUpdated method that line 62 (of spark-1.6.0) fails on. After switching Spark back to 1.3.1 everything worked; the rest is what was written earlier in this article.
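A quick way to see this interface mismatch without digging through source is to check whether onBlockUpdated exists on the SparkListener trait in each assembly; a rough sketch, with illustrative jar paths:

# present in the 1.6.0 interface, which is why the listener bus calls it...
javap -classpath spark-assembly-1.6.0-hadoop2.6.3.jar org.apache.spark.scheduler.SparkListener | grep onBlockUpdated
# ...but absent from the 1.3.1 interface that hive-1.2.1's JobMetricsListener was compiled against
javap -classpath spark-assembly-1.3.1-hadoop2.6.3.jar org.apache.spark.scheduler.SparkListener | grep onBlockUpdated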

Takeaway: when you first start using something new, installing the version the official docs specify saves a lot of grief; once you are familiar with it, then go play with other versions.

Hive on Spark vs SparkSQL vs Hive on Tez

The previous article already set up SparkSQL, and SparkSQL also ships a thriftserver service, so here is why I still chose to set up hive-on-spark:

  • SparkSQL-Thriftserver keeps all results in memory. Fast, yes, but it cannot handle queries that return large amounts of data; for tens of millions of rows SparkSQL cannot cope. hive-on-spark only uses Spark for the computation while the rest of the logic is Hive's: results are first written to HDFS and then returned to the client gradually.
  • SparkSQL-Thriftserver was rewritten entirely in Scala and is not necessarily compatible with existing Hive workloads!!
  • SparkSQL-Thriftserver's biggest advantage is that the whole server amounts to a single hive-on-spark session, and the web monitoring is clean and clear, whereas with hive-on-spark each session is a separate application!! (2016-4-13 20:57:23) With dynamic allocation in use, SparkSQL-Thriftserver did not feel much faster.
  • Because SparkSQL is memory-based, it also optimizes some of the scheduling. Take [limit]: Hive grinds through the whole computation, while SparkSQL tries incrementally larger amounts of data, attempt by attempt. SparkSQL can afford to do that, since the computed data is sitting in memory.

Hive and SparkSQL differ in philosophy: Hive's storage is HDFS, whereas SparkSQL treats HDFS merely as a persistence tool and keeps its data mostly in memory.

Looking at Hive's log, you can see the write to HDFS reflected before the results come back, with entries like:

2016-03-28 19:39:25,687 INFO  exec.FileSinkOperator (Utilities.java:mvFileToFinalPath(1882))
 - Moving tmp dir: hdfs://zfcluster/hive/scratchdir/hadoop/de2b263e-9601-4df7-bc38-ba932ae83f42/hive_2016-03-28_19-38-08_834_7914607982986605890-1/-mr-10000/.hive-staging_hive_2016-03-28_19-38-08_834_7914607982986605890-1/_tmp.-ext-10001 
 to: hdfs://zfcluster/hive/scratchdir/hadoop/de2b263e-9601-4df7-bc38-ba932ae83f42/hive_2016-03-28_19-38-08_834_7914607982986605890-1/-mr-10000/.hive-staging_hive_2016-03-28_19-38-08_834_7914607982986605890-1/-ext-10001
  • Every advantage of Tez, Spark has as well, and Tez's caching advantage is actually not that big, while Spark's caching effect is more pronounced and it can return results quickly. For example: if you query 30,000 rows, Tez has to run the whole query before returning, whereas SparkSQL stops computing once it has fetched 30,000 rows (at least that is how it looks; I have not checked the source; damn, hive-on-spark still runs the whole thing).
  • Tez task caches cannot be shared; Spark is more fine-grained and can cache at the process level (i.e. reuse previously computed results and already-loaded caches)! For example, if you query data records and at the same time need a count, some operations end up PROCESS_LOCAL, which Tez cannot match!
  • Spark's log UI is also more convenient to look at, heh.

Purely from a usability standpoint, Spark wins across the board.

References

–END

Configuring a (Dynamic) Executor Pool for SparkSQL on YARN

Official configuration docs

In practice

Modify the YARN configuration, add spark-yarn-shuffle.jar, sync the configuration and the jar to the nodemanager nodes, and restart.

[hadoop@hadoop-master1 hadoop-2.6.3]$ vi etc/hadoop/yarn-site.xml 
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle,spark_shuffle</value>
</property>

<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>

<property>
<name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
<value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>

[hadoop@hadoop-master1 hadoop-2.6.3]$ cp ~/spark-1.6.0-bin-2.6.3/lib/spark-1.6.0-yarn-shuffle.jar share/hadoop/yarn/

for h in `cat etc/hadoop/slaves` ; do rsync -az share $h:~/hadoop-2.6.3/ ; done 
for h in `cat etc/hadoop/slaves` ; do rsync -az etc $h:~/hadoop-2.6.3/ ; done 

rsync -vaz etc hadoop-master2:~/hadoop-2.6.3/
rsync -vaz share hadoop-master2:~/hadoop-2.6.3/

[hadoop@hadoop-master1 hadoop-2.6.3]$ sbin/stop-yarn.sh 
[hadoop@hadoop-master1 hadoop-2.6.3]$ sbin/start-yarn.sh 

Hive-1.2.1 is already deployed (which matches the Hive that spark-1.6.0 was built against), so just symlink hive-site.xml into spark/conf:

[hadoop@hadoop-master1 spark-1.6.0-bin-2.6.3]$ cd conf/
[hadoop@hadoop-master1 conf]$ ln -s ~/hive/conf/hive-site.xml 

[hadoop@hadoop-master1 spark-1.6.0-bin-2.6.3]$ ll conf/hive-site.xml 
lrwxrwxrwx. 1 hadoop hadoop 36 3月  25 11:30 conf/hive-site.xml -> /home/hadoop/hive/conf/hive-site.xml

Note: if Tez was configured before, comment out the hive.execution.engine setting in hive-site.xml, or switch engines at startup: bin/spark-sql --master yarn-client --hiveconf hive.execution.engine=mr
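A minimal sketch of that hive-site.xml change, simply commenting the property out so the engine falls back to the default:

<!--
<property>
  <name>hive.execution.engine</name>
  <value>tez</value>
</property>
-->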

Modify the Spark configuration

spark-defaults.conf

[hadoop@hadoop-master1 conf]$ cat spark-defaults.conf 
spark.yarn.jar    hdfs:///spark/spark-assembly-1.6.0-hadoop2.6.3-ext-2.1.jar

spark.dynamicAllocation.enabled    true
spark.shuffle.service.enabled      true
spark.dynamicAllocation.executorIdleTimeout    600s
spark.dynamicAllocation.minExecutors    160
spark.dynamicAllocation.maxExecutors    1800
spark.dynamicAllocation.schedulerBacklogTimeout   5s

spark.driver.maxResultSize   0

spark.eventLog.enabled  true
spark.eventLog.compress  true
spark.eventLog.dir    hdfs:///spark-eventlogs
spark.yarn.historyServer.address hadoop-master2:18080

spark.serializer        org.apache.spark.serializer.KryoSerializer
  • Once spark.yarn.jar is configured, Spark uses that file directly as the executors' main jar after startup and does not have to upload spark.jar on every launch (shipping ~180 MB each time is no small cost).
  • enabled: to turn on dynamic allocation, both of these settings must be set to true.
  • executorIdleTimeout: how long to wait before shutting down executors that are not in use.
  • schedulerBacklogTimeout: how long tasks may stay backlogged before deciding to add executors.
  • minExecutors: the minimum number of executors kept alive. (A quick way to watch the pool scale is sketched right after this list.)
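To watch the pool actually grow and shrink, the driver UI's REST endpoint can be polled while a query runs; a sketch, assuming the default UI port 4040 on the thriftserver host and an application id copied from the YARN web UI:

# count how many executors the running thriftserver application currently holds
curl -s http://hadoop-master1:4040/api/v1/applications/<application-id>/executors | grep -c '"id"'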

spark-env.sh environment variables

[hadoop@hadoop-master1 conf]$ cat spark-env.sh 
SPARK_CLASSPATH=/home/hadoop/hive/lib/mysql-connector-java-5.1.21-bin.jar:$SPARK_CLASSPATH
HADOOP_CONF_DIR=/home/hadoop/hadoop/etc/hadoop
SPARK_DRIVER_MEMORY=30g
SPARK_PID_DIR=/home/hadoop/tmp/pids

Start it up

[hadoop@hadoop-master1 spark-1.6.0-bin-2.6.3]$ sbin/start-thriftserver.sh --master yarn-client

[hadoop@hadoop-master2 spark-1.6.0-bin-2.6.3]$ sbin/start-history-server.sh hdfs:///spark-eventlogs

# This build does not bundle the hadoop jars, so write your own script to add the Hadoop dependencies to the classpath
[hadoop@hadoop-master2 spark-1.3.1-bin-hadoop2.6.3-without-hive]$ cat start-historyserver.sh 
#!/bin/sh

bin=`dirname $0`
cd $bin

source $HADOOP_HOME/libexec/hadoop-config.sh

export SPARK_PID_DIR=/home/hadoop/tmp/pids
export SPARK_CLASSPATH=`hadoop classpath`


# http://spark.apache.org/docs/latest/monitoring.html#viewing-after-the-fact
export SPARK_HISTORY_OPTS="-Dspark.history.fs.update.interval=30s -Dspark.history.fs.cleaner.enabled=true -Dspark.history.fs.logDirectory=hdfs:///spark-eventlogs"
sbin/start-history-server.sh 

Done.

The whole process boils down to: add the spark shuffle service to YARN, then configure the Spark parameters, and finally restart the services (yarn/hiveserver).
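A quick sanity check that the shuffle service actually came up: after the restart, the NodeManager log on any slave should mention the Spark shuffle service; a sketch, with the log file name depending on your host:

[hadoop@hadoop-slaver1 hadoop-2.6.3]$ grep -i 'spark_shuffle\|YarnShuffleService' logs/yarn-hadoop-nodemanager-*.log
# expect lines about the spark_shuffle auxiliary service being added / the YARN shuffle service starting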

–END