# Download and extract
[root@hadoop01 software]# wget http://archive.cloudera.com/cdh5/cdh/5/sqoop-1.4.6-cdh5.7.0.tar.gz
[hadoop@hadoop01 software]$ tar -zxvf sqoop-1.4.6-cdh5.7.0.tar.gz -C /home/hadoop/app/
[hadoop@hadoop01 software]$ cd ~/app/
# Configure environment variables
[root@hadoop01 ~]# vi /etc/profile
export SQOOP_HOME=/home/hadoop/app/sqoop-1.4.6-cdh5.7.0
export PATH=$SQOOP_HOME/bin:$HIVE_HOME/bin:$HADOOP_HOME/bin:$JAVA_HOME/bin:$PATH
[root@hadoop01 ~]# source /etc/profile
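# Note: /etc/profile was edited as root, so the hadoop user has to re-login or source it again before sqoop is on its PATH; an optional quick check:
[hadoop@hadoop01 ~]$ source /etc/profile
[hadoop@hadoop01 ~]$ echo $SQOOP_HOME
[hadoop@hadoop01 ~]$ which sqoop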
# Edit the Sqoop configuration file
[hadoop@hadoop01 ~]$ cp app/sqoop-1.4.6-cdh5.7.0/conf/sqoop-env-template.sh app/sqoop-1.4.6-cdh5.7.0/conf/sqoop-env.sh
[hadoop@hadoop01 ~]$ vi app/sqoop-1.4.6-cdh5.7.0/conf/sqoop-env.sh
export HADOOP_COMMON_HOME=/home/hadoop/app/hadoop-2.6.0-cdh5.7.0
export HADOOP_MAPRED_HOME=/home/hadoop/app/hadoop-2.6.0-cdh5.7.0
export HIVE_HOME=/home/hadoop/app/apache-hive-1.1.0-cdh5.7.0-bin
# Copy the MySQL JDBC driver jar into Sqoop's lib directory
[hadoop@hadoop01 ~]$ cp ~/software/mysql-connector-java-5.1.46/mysql-connector-java-5.1.46.jar /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/lib
# Check the Sqoop version
[hadoop@hadoop01 ~]$ sqoop version
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/25 23:49:04 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
Sqoop 1.4.6-cdh5.7.0
git commit id
Compiled by jenkins on Wed Mar 23 11:30:51 PDT 2016
[hadoop@hadoop01 ~]$
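The warnings about HBASE_HOME, HCAT_HOME, ACCUMULO_HOME and ZOOKEEPER_HOME are harmless here, since those components are not used. If they were installed, exporting the corresponding variables in sqoop-env.sh would silence them; a sketch with placeholder paths (not real installs on this machine):
export HBASE_HOME=/home/hadoop/app/hbase           # placeholder, only if HBase is installed
export ZOOKEEPER_HOME=/home/hadoop/app/zookeeper   # placeholder, only if ZooKeeper is installed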
3. Basic Sqoop usage
sqoop help
[hadoop@hadoop01 app]$ sqoop help
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/26 20:53:54 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
usage: sqoop COMMAND [ARGS]
Available commands:
codegen Generate code to interact with database records
create-hive-table Import a table definition into Hive
eval Evaluate a SQL statement and display the results
export Export an HDFS directory to a database table
help List available commands
import Import a table from a database to HDFS
import-all-tables Import tables from a database to HDFS
import-mainframe Import datasets from a mainframe server to HDFS
job Work with saved jobs
list-databases List available databases on a server
list-tables List available tables in a database
merge Merge results of incremental imports
metastore Run a standalone Sqoop metastore
version Display version information
See 'sqoop help COMMAND' for information on a specific command.
[hadoop@hadoop01 app]$
sqoop list-databases: list the databases in MySQL
[hadoop@hadoop01 ~]$ sqoop list-databases \
> --connect jdbc:mysql://localhost:3306 \
> --username root --password root
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/25 23:51:25 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
18/06/25 23:51:25 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/06/25 23:51:25 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
information_schema
hive_basic01
mysql
performance_schema
[hadoop@hadoop01 ~]$
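As the WARN line above points out, passing --password on the command line is insecure. A sketch of the same command letting Sqoop prompt for the password instead (-P):
sqoop list-databases \
  --connect jdbc:mysql://localhost:3306 \
  --username root \
  -P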
sqoop list-tables: list all tables in a given MySQL database
[hadoop@hadoop01 hadoop-2.6.0-cdh5.7.0]$ sqoop list-tables \
> --connect jdbc:mysql://localhost:3306/mysql \
> --username root --password root
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/12 00:12:46 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
18/06/12 00:12:46 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/06/12 00:12:47 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
columns_priv
db
event
func
general_log
help_category
help_keyword
help_relation
help_topic
innodb_index_stats
innodb_table_stats
ndb_binlog_index
plugin
proc
procs_priv
proxies_priv
servers
slave_master_info
slave_relay_log_info
slave_worker_info
slow_log
tables_priv
time_zone
time_zone_leap_second
time_zone_name
time_zone_transition
time_zone_transition_type
user
[hadoop@hadoop01 hadoop-2.6.0-cdh5.7.0]$
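Before importing, sqoop eval can run an ad-hoc SQL statement against MySQL to preview the source data; a sketch against the db_1 database and city_info table that are imported below:
sqoop eval \
  --connect jdbc:mysql://localhost:3306/db_1 \
  --username root --password root \
  --query "select * from city_info limit 5"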
# Create the city information table in Hive
hive> use default;
hive> create table city_info(city_id int,city_name string,area string) row format delimited fields terminated by '\t';
# Use Sqoop to import the data from MySQL into Hive
[hadoop@hadoop01 ~]$ sqoop import --connect jdbc:mysql://localhost:3306/db_1 --username root --password root --table city_info --hive-table default.city_info --hive-import --fields-terminated-by '\t' --hive-overwrite -m 1
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/26 23:49:54 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
18/06/26 23:49:54 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/06/26 23:49:54 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/06/26 23:49:54 INFO tool.CodeGenTool: Beginning code generation
18/06/26 23:49:55 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `city_info` AS t LIMIT 1
18/06/26 23:49:55 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `city_info` AS t LIMIT 1
18/06/26 23:49:55 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /home/hadoop/app/hadoop-2.6.0-cdh5.7.0
Note: /tmp/sqoop-hadoop/compile/57da79d0e430f9fdaaf006c8fc513222/city_info.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/06/26 23:49:56 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-hadoop/compile/57da79d0e430f9fdaaf006c8fc513222/city_info.jar
18/06/26 23:49:56 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/06/26 23:49:56 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/06/26 23:49:56 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/06/26 23:49:56 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/06/26 23:49:56 INFO mapreduce.ImportJobBase: Beginning import of city_info
18/06/26 23:49:57 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/06/26 23:49:57 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/06/26 23:49:57 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/06/26 23:50:02 INFO db.DBInputFormat: Using read commited transaction isolation
18/06/26 23:50:03 INFO mapreduce.JobSubmitter: number of splits:1
18/06/26 23:50:03 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1528733327581_0007
18/06/26 23:50:03 INFO impl.YarnClientImpl: Submitted application application_1528733327581_0007
18/06/26 23:50:03 INFO mapreduce.Job: The url to track the job: http://hadoop01:8088/proxy/application_1528733327581_0007/
18/06/26 23:50:03 INFO mapreduce.Job: Running job: job_1528733327581_0007
18/06/26 23:50:10 INFO mapreduce.Job: Job job_1528733327581_0007 running in uber mode : false
18/06/26 23:50:10 INFO mapreduce.Job: map 0% reduce 0%
18/06/26 23:50:15 INFO mapreduce.Job: map 100% reduce 0%
18/06/26 23:50:15 INFO mapreduce.Job: Job job_1528733327581_0007 completed successfully
18/06/26 23:50:15 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=136872
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=87
HDFS: Number of bytes written=128
HDFS: Number of read operations=4
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=2811
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=2811
Total vcore-seconds taken by all map tasks=2811
Total megabyte-seconds taken by all map tasks=2878464
Map-Reduce Framework
Map input records=10
Map output records=10
Input split bytes=87
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=56
CPU time spent (ms)=1190
Physical memory (bytes) snapshot=218820608
Virtual memory (bytes) snapshot=2789232640
Total committed heap usage (bytes)=230162432
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=128
18/06/26 23:50:15 INFO mapreduce.ImportJobBase: Transferred 128 bytes in 17.3706 seconds (7.3688 bytes/sec)
18/06/26 23:50:15 INFO mapreduce.ImportJobBase: Retrieved 10 records.
18/06/26 23:50:15 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `city_info` AS t LIMIT 1
18/06/26 23:50:15 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/home/hadoop/app/apache-hive-1.1.0-cdh5.7.0-bin/lib/hive-common-1.1.0-cdh5.7.0.jar!/hive-log4j.properties
OK
Time taken: 0.989 seconds
Loading data to table default.city_info
Table default.city_info stats: [numFiles=1, numRows=0, totalSize=128, rawDataSize=0]
OK
Time taken: 0.648 seconds
[hadoop@hadoop01 ~]$
# Check the Hive table; the data was imported successfully
hive> select * from city_info;
OK
1 BEIJING NC
2 SHANGHAI EC
3 NANJING EC
4 GUANGZHOU SC
5 SANYA SC
6 WUHAN CC
7 CHANGSHA CC
8 XIAN NW
9 CHENGDU SW
10 HAERBIN NE
Time taken: 0.076 seconds, Fetched: 10 row(s)
hive>
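For reference, the one-line import command used above is equivalent to the multi-line form below (identical flags): --connect points at the source MySQL database, --hive-import/--hive-table load the data into the Hive table created earlier, --hive-overwrite replaces any existing data, --fields-terminated-by must match the Hive table delimiter, and -m 1 runs a single map task since the table is tiny.
sqoop import \
  --connect jdbc:mysql://localhost:3306/db_1 \
  --username root --password root \
  --table city_info \
  --hive-import \
  --hive-table default.city_info \
  --hive-overwrite \
  --fields-terminated-by '\t' \
  -m 1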
# Create the product information table in Hive
hive> create table product_info(product_id int,product_name string,extend_info string) row format delimited fields terminated by '\t';
OK
Time taken: 0.126 seconds
hive>
# Use Sqoop to import the MySQL data into the Hive table
[hadoop@hadoop01 ~]$ sqoop import --connect jdbc:mysql://localhost:3306/db_1 --username root --password root --table product_info --hive-table default.product_info --hive-import --fields-terminated-by '\t' --hive-overwrite -m 1
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/26 23:56:09 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
18/06/26 23:56:09 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/06/26 23:56:10 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/06/26 23:56:10 INFO tool.CodeGenTool: Beginning code generation
18/06/26 23:56:10 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `product_info` AS t LIMIT 1
18/06/26 23:56:10 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `product_info` AS t LIMIT 1
18/06/26 23:56:10 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /home/hadoop/app/hadoop-2.6.0-cdh5.7.0
Note: /tmp/sqoop-hadoop/compile/19c3ac4754d2301f535018709e32483e/product_info.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/06/26 23:56:12 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-hadoop/compile/19c3ac4754d2301f535018709e32483e/product_info.jar
18/06/26 23:56:12 WARN manager.MySQLManager: It looks like you are importing from mysql.
18/06/26 23:56:12 WARN manager.MySQLManager: This transfer can be faster! Use the --direct
18/06/26 23:56:12 WARN manager.MySQLManager: option to exercise a MySQL-specific fast path.
18/06/26 23:56:12 INFO manager.MySQLManager: Setting zero DATETIME behavior to convertToNull (mysql)
18/06/26 23:56:12 INFO mapreduce.ImportJobBase: Beginning import of product_info
18/06/26 23:56:12 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/06/26 23:56:13 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/06/26 23:56:13 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/06/26 23:56:18 INFO db.DBInputFormat: Using read commited transaction isolation
18/06/26 23:56:19 INFO mapreduce.JobSubmitter: number of splits:1
18/06/26 23:56:19 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1528733327581_0008
18/06/26 23:56:20 INFO impl.YarnClientImpl: Submitted application application_1528733327581_0008
18/06/26 23:56:20 INFO mapreduce.Job: The url to track the job: http://hadoop01:8088/proxy/application_1528733327581_0008/
18/06/26 23:56:20 INFO mapreduce.Job: Running job: job_1528733327581_0008
18/06/26 23:56:27 INFO mapreduce.Job: Job job_1528733327581_0008 running in uber mode : false
18/06/26 23:56:27 INFO mapreduce.Job: map 0% reduce 0%
18/06/26 23:56:32 INFO mapreduce.Job: map 100% reduce 0%
18/06/26 23:56:33 INFO mapreduce.Job: Job job_1528733327581_0008 completed successfully
18/06/26 23:56:33 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=136903
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=87
HDFS: Number of bytes written=3078
HDFS: Number of read operations=4
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=1
Other local map tasks=1
Total time spent by all maps in occupied slots (ms)=3088
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=3088
Total vcore-seconds taken by all map tasks=3088
Total megabyte-seconds taken by all map tasks=3162112
Map-Reduce Framework
Map input records=91
Map output records=91
Input split bytes=87
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=54
CPU time spent (ms)=1240
Physical memory (bytes) snapshot=221388800
Virtual memory (bytes) snapshot=2789806080
Total committed heap usage (bytes)=228589568
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=3078
18/06/26 23:56:33 INFO mapreduce.ImportJobBase: Transferred 3.0059 KB in 20.3942 seconds (150.9254 bytes/sec)
18/06/26 23:56:33 INFO mapreduce.ImportJobBase: Retrieved 91 records.
18/06/26 23:56:33 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `product_info` AS t LIMIT 1
18/06/26 23:56:33 INFO hive.HiveImport: Loading uploaded data into Hive
Logging initialized using configuration in jar:file:/home/hadoop/app/apache-hive-1.1.0-cdh5.7.0-bin/lib/hive-common-1.1.0-cdh5.7.0.jar!/hive-log4j.properties
OK
Time taken: 1.34 seconds
Loading data to table default.product_info
Table default.product_info stats: [numFiles=1, numRows=0, totalSize=3078, rawDataSize=0]
OK
Time taken: 0.561 seconds
[hadoop@hadoop01 ~]$
# Check the Hive product table; the data has been loaded
hive> select * from product_info;
OK
1 product1 {"product_status":1}
2 product2 {"product_status":1}
3 product3 {"product_status":1}
4 product4 {"product_status":1}
5 product5 {"product_status":1}
6 product6 {"product_status":1}
7 product7 {"product_status":1}
8 product8 {"product_status":1}
9 product9 {"product_status":0}
10 product10 {"product_status":1}
11 product11 {"product_status":0}
12 product12 {"product_status":0}
13 product13 {"product_status":0}
14 product14 {"product_status":0}
15 product15 {"product_status":1}
16 product16 {"product_status":0}
17 product17 {"product_status":1}
18 product18 {"product_status":0}
19 product19 {"product_status":1}
20 product20 {"product_status":1}
21 product21 {"product_status":0}
22 product22 {"product_status":0}
23 product23 {"product_status":0}
24 product24 {"product_status":0}
25 product25 {"product_status":1}
26 product26 {"product_status":1}
27 product27 {"product_status":0}
28 product28 {"product_status":1}
29 product29 {"product_status":0}
30 product30 {"product_status":0}
31 product31 {"product_status":0}
32 product32 {"product_status":0}
33 product33 {"product_status":1}
34 product34 {"product_status":1}
35 product35 {"product_status":0}
36 product36 {"product_status":0}
37 product37 {"product_status":1}
38 product38 {"product_status":0}
39 product39 {"product_status":0}
40 product40 {"product_status":1}
41 product41 {"product_status":1}
42 product42 {"product_status":1}
43 product43 {"product_status":1}
44 product44 {"product_status":0}
45 product45 {"product_status":1}
46 product46 {"product_status":1}
47 product47 {"product_status":0}
48 product48 {"product_status":1}
49 product49 {"product_status":1}
50 product50 {"product_status":1}
51 product51 {"product_status":1}
52 product52 {"product_status":0}
53 product53 {"product_status":0}
54 product54 {"product_status":1}
55 product55 {"product_status":0}
56 product56 {"product_status":0}
57 product57 {"product_status":1}
58 product58 {"product_status":1}
59 product59 {"product_status":1}
60 product60 {"product_status":1}
61 product61 {"product_status":0}
62 product62 {"product_status":1}
63 product63 {"product_status":1}
64 product64 {"product_status":0}
65 product65 {"product_status":0}
66 product66 {"product_status":1}
67 product67 {"product_status":1}
68 product68 {"product_status":0}
69 product69 {"product_status":1}
70 product70 {"product_status":0}
71 product71 {"product_status":0}
72 product72 {"product_status":0}
73 product73 {"product_status":1}
74 product74 {"product_status":0}
75 product75 {"product_status":1}
76 product76 {"product_status":0}
77 product77 {"product_status":0}
78 product78 {"product_status":1}
79 product79 {"product_status":0}
80 product80 {"product_status":0}
81 product81 {"product_status":0}
82 product82 {"product_status":1}
83 product83 {"product_status":1}
84 product84 {"product_status":1}
85 product85 {"product_status":0}
86 product86 {"product_status":1}
87 product87 {"product_status":1}
88 product88 {"product_status":1}
89 product89 {"product_status":1}
90 product90 {"product_status":1}
91 product91 {"product_status":1}
92 product92 {"product_status":0}
93 product93 {"product_status":0}
94 product94 {"product_status":1}
95 product95 {"product_status":0}
96 product96 {"product_status":0}
97 product97 {"product_status":1}
98 product98 {"product_status":1}
99 product99 {"product_status":0}
100 product100 {"product_status":1}
Time taken: 0.059 seconds, Fetched: 100 row(s)
hive>
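Optionally, the data Sqoop loaded into Hive can be inspected directly in HDFS (assuming the default warehouse location /user/hive/warehouse, which the export step below also relies on; with a single map task the output file is typically part-m-00000):
hdfs dfs -ls /user/hive/warehouse/product_info
hdfs dfs -cat /user/hive/warehouse/product_info/part-m-00000 | head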
3 - All three tables are now in Hive; join them to get the desired result
================================
**user_click a**
user_id int
session_id string
action_time string
city_id int
product_id int
create_date string
===============================
**city_info b**
city_id int
city_name string
area string
================================
**product_info c**
product_id int
product_name string
extend_info string
================================
select * from
(
select aa.product_id,aa.product_name,aa.area,aa.click_count,action_date,row_number() over(partition by aa.area order by aa.click_count desc) rank
from
(select a.product_id,c.product_name,b.area,count(b.area) as click_count,to_date(a.action_time) as action_date
from user_click a
left join city_info b on a.city_id = b.city_id
left join product_info c on a.product_id = c.product_id
where a.city_id != 0 and a.product_id != 0
group by a.product_id,c.product_name,b.area,to_date(a.action_time)
) aa
) aaa
where aaa.rank <= 3
;
hive> select * from
> (
> select aa.product_id,aa.product_name,aa.area,aa.click_count,action_date,row_number() over(partition by aa.area order by aa.click_count desc) rank
> from
> (select a.product_id,c.product_name,b.area,count(b.area) as click_count,to_date(a.action_time) as action_date
> from user_click a
> left join city_info b on a.city_id = b.city_id
> left join product_info c on a.product_id = c.product_id
> where a.city_id != 0 and a.product_id != 0
> group by a.product_id,c.product_name,b.area,to_date(a.action_time)
> ) aa
> ) aaa
> where aaa.rank <= 3
> ;
Query ID = hadoop_20180626234242_f93f26a1-4f3a-44c4-8488-85c290667e75
Total jobs = 2
Execution log at: /tmp/hadoop/hadoop_20180626234242_f93f26a1-4f3a-44c4-8488-85c290667e75.log
2018-06-27 02:43:42 Starting to launch local task to process map join; maximum memory = 477626368
2018-06-27 02:43:43 Dump the side-table for tag: 1 with group count: 100 into file: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-43-37_794_6506428922693384148-1/-local-10007/HashTable-Stage-3/MapJoin-mapfile231--.hashtable
2018-06-27 02:43:43 Uploaded 1 File to: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-43-37_794_6506428922693384148-1/-local-10007/HashTable-Stage-3/MapJoin-mapfile231--.hashtable (3165 bytes)
2018-06-27 02:43:43 Dump the side-table for tag: 1 with group count: 10 into file: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-43-37_794_6506428922693384148-1/-local-10007/HashTable-Stage-3/MapJoin-mapfile241--.hashtable
2018-06-27 02:43:43 Uploaded 1 File to: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-43-37_794_6506428922693384148-1/-local-10007/HashTable-Stage-3/MapJoin-mapfile241--.hashtable (480 bytes)
2018-06-27 02:43:43 End of local task; Time Taken: 1.427 sec.
Execution completed successfully
MapredLocal task succeeded
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1528733327581_0033, Tracking URL = http://hadoop01:8088/proxy/application_1528733327581_0033/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1528733327581_0033
Hadoop job information for Stage-3: number of mappers: 1; number of reducers: 1
2018-06-27 02:43:50,616 Stage-3 map = 0%, reduce = 0%
2018-06-27 02:43:56,786 Stage-3 map = 100%, reduce = 0%, Cumulative CPU 4.85 sec
2018-06-27 02:44:04,010 Stage-3 map = 100%, reduce = 100%, Cumulative CPU 7.25 sec
MapReduce Total cumulative CPU time: 7 seconds 250 msec
Ended Job = job_1528733327581_0033
Launching Job 2 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1528733327581_0034, Tracking URL = http://hadoop01:8088/proxy/application_1528733327581_0034/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1528733327581_0034
Hadoop job information for Stage-4: number of mappers: 1; number of reducers: 1
2018-06-27 02:44:11,129 Stage-4 map = 0%, reduce = 0%
2018-06-27 02:44:16,287 Stage-4 map = 100%, reduce = 0%, Cumulative CPU 1.51 sec
2018-06-27 02:44:22,475 Stage-4 map = 100%, reduce = 100%, Cumulative CPU 4.92 sec
MapReduce Total cumulative CPU time: 4 seconds 920 msec
Ended Job = job_1528733327581_0034
MapReduce Jobs Launched:
Stage-Stage-3: Map: 1 Reduce: 1 Cumulative CPU: 7.25 sec HDFS Read: 741503 HDFS Write: 25824 SUCCESS
Stage-Stage-4: Map: 1 Reduce: 1 Cumulative CPU: 4.92 sec HDFS Read: 33659 HDFS Write: 566 SUCCESS
Total MapReduce CPU Time Spent: 12 seconds 170 msec
OK
7 product7 CC 39 2016-05-05 1
26 product26 CC 39 2016-05-05 2
70 product70 CC 38 2016-05-05 3
4 product4 EC 40 2016-05-05 1
96 product96 EC 32 2016-05-05 2
5 product5 EC 31 2016-05-05 3
9 product9 NC 16 2016-05-05 1
40 product40 NC 16 2016-05-05 2
5 product5 NC 13 2016-05-05 3
56 product56 NW 20 2016-05-05 1
67 product67 NW 20 2016-05-05 2
48 product48 NW 19 2016-05-05 3
38 product38 SC 35 2016-05-05 1
98 product98 SC 34 2016-05-05 2
33 product33 SC 34 2016-05-05 3
16 product16 SW 20 2016-05-05 1
95 product95 SW 19 2016-05-05 2
60 product60 SW 19 2016-05-05 3
Time taken: 46.789 seconds, Fetched: 18 row(s)
hive>
# Create a Hive table to store the query results
hive> create table click_result (
> product_id int,
> product_name string,
> area string,
> click_count int,
> action_date string,
> rank int)
> row format delimited fields terminated by '\t';
OK
Time taken: 0.102 seconds
hive>
hive> insert overwrite table click_result
> select * from
> (
> select aa.product_id,aa.product_name,aa.area,aa.click_count,action_date,row_number() over(partition by aa.area order by aa.click_count desc) rank
> from
> (select a.product_id,c.product_name,b.area,count(b.area) as click_count,to_date(a.action_time) as action_date
> from user_click a
> left join city_info b on a.city_id = b.city_id
> left join product_info c on a.product_id = c.product_id
> where a.city_id != 0 and a.product_id != 0
> group by a.product_id,c.product_name,b.area,to_date(a.action_time)
> ) aa
> ) aaa
> where aaa.rank <= 3
> ;
Query ID = hadoop_20180626234242_f93f26a1-4f3a-44c4-8488-85c290667e75
Total jobs = 2
Execution log at: /tmp/hadoop/hadoop_20180626234242_f93f26a1-4f3a-44c4-8488-85c290667e75.log
2018-06-27 02:57:58 Starting to launch local task to process map join; maximum memory = 477626368
2018-06-27 02:57:59 Dump the side-table for tag: 1 with group count: 100 into file: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-57-53_850_3892712994273534029-1/-local-10005/HashTable-Stage-3/MapJoin-mapfile271--.hashtable
2018-06-27 02:57:59 Uploaded 1 File to: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-57-53_850_3892712994273534029-1/-local-10005/HashTable-Stage-3/MapJoin-mapfile271--.hashtable (3165 bytes)
2018-06-27 02:57:59 Dump the side-table for tag: 1 with group count: 10 into file: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-57-53_850_3892712994273534029-1/-local-10005/HashTable-Stage-3/MapJoin-mapfile281--.hashtable
2018-06-27 02:57:59 Uploaded 1 File to: file:/tmp/hadoop/91f8752f-237a-4d19-94ea-b25f2850efab/hive_2018-06-27_02-57-53_850_3892712994273534029-1/-local-10005/HashTable-Stage-3/MapJoin-mapfile281--.hashtable (480 bytes)
2018-06-27 02:57:59 End of local task; Time Taken: 1.431 sec.
Execution completed successfully
MapredLocal task succeeded
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1528733327581_0037, Tracking URL = http://hadoop01:8088/proxy/application_1528733327581_0037/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1528733327581_0037
Hadoop job information for Stage-3: number of mappers: 1; number of reducers: 1
2018-06-27 02:58:06,264 Stage-3 map = 0%, reduce = 0%
2018-06-27 02:58:12,483 Stage-3 map = 100%, reduce = 0%, Cumulative CPU 5.41 sec
2018-06-27 02:58:18,667 Stage-3 map = 100%, reduce = 100%, Cumulative CPU 7.9 sec
MapReduce Total cumulative CPU time: 7 seconds 900 msec
Ended Job = job_1528733327581_0037
Launching Job 2 out of 2
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
Starting Job = job_1528733327581_0038, Tracking URL = http://hadoop01:8088/proxy/application_1528733327581_0038/
Kill Command = /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/bin/hadoop job -kill job_1528733327581_0038
Hadoop job information for Stage-4: number of mappers: 1; number of reducers: 1
2018-06-27 02:58:24,442 Stage-4 map = 0%, reduce = 0%
2018-06-27 02:58:29,587 Stage-4 map = 100%, reduce = 0%, Cumulative CPU 1.56 sec
2018-06-27 02:58:36,827 Stage-4 map = 100%, reduce = 100%, Cumulative CPU 5.36 sec
MapReduce Total cumulative CPU time: 5 seconds 360 msec
Ended Job = job_1528733327581_0038
Loading data to table default.click_result
Table default.click_result stats: [numFiles=1, numRows=18, totalSize=566, rawDataSize=548]
MapReduce Jobs Launched:
Stage-Stage-3: Map: 1 Reduce: 1 Cumulative CPU: 7.9 sec HDFS Read: 741503 HDFS Write: 25824 SUCCESS
Stage-Stage-4: Map: 1 Reduce: 1 Cumulative CPU: 5.36 sec HDFS Read: 34125 HDFS Write: 643 SUCCESS
Total MapReduce CPU Time Spent: 13 seconds 260 msec
OK
Time taken: 45.279 seconds
hive> select * from click_result;
OK
7 product7 CC 39 2016-05-05 1
26 product26 CC 39 2016-05-05 2
70 product70 CC 38 2016-05-05 3
4 product4 EC 40 2016-05-05 1
96 product96 EC 32 2016-05-05 2
5 product5 EC 31 2016-05-05 3
9 product9 NC 16 2016-05-05 1
40 product40 NC 16 2016-05-05 2
5 product5 NC 13 2016-05-05 3
56 product56 NW 20 2016-05-05 1
67 product67 NW 20 2016-05-05 2
48 product48 NW 19 2016-05-05 3
38 product38 SC 35 2016-05-05 1
98 product98 SC 34 2016-05-05 2
33 product33 SC 34 2016-05-05 3
16 product16 SW 20 2016-05-05 1
95 product95 SW 19 2016-05-05 2
60 product60 SW 19 2016-05-05 3
Time taken: 0.063 seconds, Fetched: 18 row(s)
hive>
4 - Export the results from Hive to MySQL
# Create the result table click_result in MySQL
mysql> create table click_result (
-> product_id int,
-> product_name varchar(255),
-> area varchar(255),
-> click_count int,
-> action_date varchar(255),
-> rank int
-> );
Query OK, 0 rows affected (0.13 sec)
mysql>
# Use Sqoop to export the Hive result table to MySQL
[hadoop@hadoop01 ~]$ sqoop export --connect jdbc:mysql://localhost:3306/db_1 --username root --password root --table click_result --export-dir /user/hive/warehouse/click_result --input-fields-terminated-by '\t'
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hbase does not exist! HBase imports will fail.
Please set $HBASE_HOME to the root of your HBase installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../hcatalog does not exist! HCatalog jobs will fail.
Please set $HCAT_HOME to the root of your HCatalog installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../accumulo does not exist! Accumulo imports will fail.
Please set $ACCUMULO_HOME to the root of your Accumulo installation.
Warning: /home/hadoop/app/sqoop-1.4.6-cdh5.7.0/../zookeeper does not exist! Accumulo imports will fail.
Please set $ZOOKEEPER_HOME to the root of your Zookeeper installation.
18/06/27 03:16:31 INFO sqoop.Sqoop: Running Sqoop version: 1.4.6-cdh5.7.0
18/06/27 03:16:31 WARN tool.BaseSqoopTool: Setting your password on the command-line is insecure. Consider using -P instead.
18/06/27 03:16:31 INFO manager.MySQLManager: Preparing to use a MySQL streaming resultset.
18/06/27 03:16:31 INFO tool.CodeGenTool: Beginning code generation
18/06/27 03:16:32 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `click_result` AS t LIMIT 1
18/06/27 03:16:32 INFO manager.SqlManager: Executing SQL statement: SELECT t.* FROM `click_result` AS t LIMIT 1
18/06/27 03:16:32 INFO orm.CompilationManager: HADOOP_MAPRED_HOME is /home/hadoop/app/hadoop-2.6.0-cdh5.7.0
Note: /tmp/sqoop-hadoop/compile/98550cf0ebad34886cf008380807403f/click_result.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
18/06/27 03:16:33 INFO orm.CompilationManager: Writing jar file: /tmp/sqoop-hadoop/compile/98550cf0ebad34886cf008380807403f/click_result.jar
18/06/27 03:16:33 INFO mapreduce.ExportJobBase: Beginning export of click_result
18/06/27 03:16:34 INFO Configuration.deprecation: mapred.jar is deprecated. Instead, use mapreduce.job.jar
18/06/27 03:16:34 INFO Configuration.deprecation: mapred.map.max.attempts is deprecated. Instead, use mapreduce.map.maxattempts
18/06/27 03:16:35 INFO Configuration.deprecation: mapred.reduce.tasks.speculative.execution is deprecated. Instead, use mapreduce.reduce.speculative
18/06/27 03:16:35 INFO Configuration.deprecation: mapred.map.tasks.speculative.execution is deprecated. Instead, use mapreduce.map.speculative
18/06/27 03:16:35 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps
18/06/27 03:16:35 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
18/06/27 03:16:38 INFO input.FileInputFormat: Total input paths to process : 1
18/06/27 03:16:38 INFO input.FileInputFormat: Total input paths to process : 1
18/06/27 03:16:38 INFO mapreduce.JobSubmitter: number of splits:4
18/06/27 03:16:38 INFO Configuration.deprecation: mapred.map.tasks.speculative.execution is deprecated. Instead, use mapreduce.map.speculative
18/06/27 03:16:39 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1528733327581_0043
18/06/27 03:16:39 INFO impl.YarnClientImpl: Submitted application application_1528733327581_0043
18/06/27 03:16:39 INFO mapreduce.Job: The url to track the job: http://hadoop01:8088/proxy/application_1528733327581_0043/
18/06/27 03:16:39 INFO mapreduce.Job: Running job: job_1528733327581_0043
18/06/27 03:16:46 INFO mapreduce.Job: Job job_1528733327581_0043 running in uber mode : false
18/06/27 03:16:46 INFO mapreduce.Job: map 0% reduce 0%
18/06/27 03:16:52 INFO mapreduce.Job: map 25% reduce 0%
18/06/27 03:16:53 INFO mapreduce.Job: map 50% reduce 0%
18/06/27 03:16:54 INFO mapreduce.Job: map 75% reduce 0%
18/06/27 03:16:55 INFO mapreduce.Job: map 100% reduce 0%
18/06/27 03:16:55 INFO mapreduce.Job: Job job_1528733327581_0043 completed successfully
18/06/27 03:16:55 INFO mapreduce.Job: Counters: 30
File System Counters
FILE: Number of bytes read=0
FILE: Number of bytes written=546816
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=2176
HDFS: Number of bytes written=0
HDFS: Number of read operations=19
HDFS: Number of large read operations=0
HDFS: Number of write operations=0
Job Counters
Launched map tasks=4
Data-local map tasks=4
Total time spent by all maps in occupied slots (ms)=15974
Total time spent by all reduces in occupied slots (ms)=0
Total time spent by all map tasks (ms)=15974
Total vcore-seconds taken by all map tasks=15974
Total megabyte-seconds taken by all map tasks=16357376
Map-Reduce Framework
Map input records=18
Map output records=18
Input split bytes=671
Spilled Records=0
Failed Shuffles=0
Merged Map outputs=0
GC time elapsed (ms)=439
CPU time spent (ms)=3710
Physical memory (bytes) snapshot=849416192
Virtual memory (bytes) snapshot=11142557696
Total committed heap usage (bytes)=1166016512
File Input Format Counters
Bytes Read=0
File Output Format Counters
Bytes Written=0
18/06/27 03:16:55 INFO mapreduce.ExportJobBase: Transferred 2.125 KB in 20.8443 seconds (104.3929 bytes/sec)
18/06/27 03:16:55 INFO mapreduce.ExportJobBase: Exported 18 records.
[hadoop@hadoop01 ~]$
# The results are now in MySQL
mysql> select * from click_result;
+------------+--------------+------+-------------+-------------+------+
| product_id | product_name | area | click_count | action_date | rank |
+------------+--------------+------+-------------+-------------+------+
| 33 | product33 | SC | 34 | 2016-05-05 | 3 |
| 16 | product16 | SW | 20 | 2016-05-05 | 1 |
| 95 | product95 | SW | 19 | 2016-05-05 | 2 |
| 60 | product60 | SW | 19 | 2016-05-05 | 3 |
| 7 | product7 | CC | 39 | 2016-05-05 | 1 |
| 26 | product26 | CC | 39 | 2016-05-05 | 2 |
| 70 | product70 | CC | 38 | 2016-05-05 | 3 |
| 4 | product4 | EC | 40 | 2016-05-05 | 1 |
| 96 | product96 | EC | 32 | 2016-05-05 | 2 |
| 5 | product5 | EC | 31 | 2016-05-05 | 3 |
| 9 | product9 | NC | 16 | 2016-05-05 | 1 |
| 40 | product40 | NC | 16 | 2016-05-05 | 2 |
| 5 | product5 | NC | 13 | 2016-05-05 | 3 |
| 56 | product56 | NW | 20 | 2016-05-05 | 1 |
| 67 | product67 | NW | 20 | 2016-05-05 | 2 |
| 48 | product48 | NW | 19 | 2016-05-05 | 3 |
| 38 | product38 | SC | 35 | 2016-05-05 | 1 |
| 98 | product98 | SC | 34 | 2016-05-05 | 2 |
+------------+--------------+------+-------------+-------------+------+
18 rows in set (0.00 sec)
mysql>
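Note that sqoop export appends rows, so re-running the export above would duplicate the 18 records. A simple way to keep the MySQL table in sync on repeated runs is to empty it first (sketch below); Sqoop's --update-key/--update-mode options are an alternative for upsert-style exports.
mysql> truncate table click_result;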
1. Sqoop
Sqoop: SQL to Hadoop. Scenario: the data sits in an RDBMS; how can we analyze it with Hive or Hadoop? 1) RDBMS ==> Hadoop 2) Hadoop ==> RDBMS. Under the hood this is plain MapReduce (InputFormat / OutputFormat), and Sqoop acts as the bridge between the RDBMS and Hadoop.
Sqoop versions. Sqoop 1.x (1.4.7): implemented on top of MapReduce, and the generated jobs are map-only (no reduce phase); RDBMS ==> HDFS goes through JDBC.
Sqoop 2.x (1.99.7): a completely different branch from 1.4.7, mentioned here only for awareness.
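Because a Sqoop 1.x job is map-only, parallelism is simply the number of map tasks. A sketch of splitting an import across several mappers (standard Sqoop options; the --target-dir path is a placeholder):
sqoop import \
  --connect jdbc:mysql://localhost:3306/db_1 \
  --username root -P \
  --table product_info \
  --split-by product_id \
  --target-dir /user/hadoop/product_info_parallel \
  -m 4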
2. Sqoop installation
3. Basic Sqoop usage
sqoop help
sqoop list-databases: list the databases in MySQL
sqoop list-tables: list all tables in a given MySQL database
Data import: RDBMS ==> HDFS
sqoop import
Hands-on exercise
Business scenario: MySQL holds two tables, city_info and product_info; Hive holds the user click log file user_click.txt. Requirement: for each area (cities belong to areas, e.g. Shanghai belongs to East China), compute the Top 3 products by click count (product name, click count, etc.), then export the result to MySQL for visualization.
Steps: 1 - Create the city information table in Hive and use Sqoop to import the city data from MySQL into it.
2 - Create the product information table in Hive and use Sqoop to import the product data from MySQL into it.
3 - With all three tables in Hive, join them to get the desired result.
4 - Export the results from Hive to MySQL.