#!/bin/bash
KAFKA_CONNECT_HOME=/home1/irteam/apps/kafka-connect
BIN_DIRECTORY=${KAFKA_CONNECT_HOME}/bin
LOG_DIRECTORY=${KAFKA_CONNECT_HOME}/log
GC_LOG_DIRECTORY=${KAFKA_CONNECT_HOME}/log/gc
LOG_PROPERTIES=${KAFKA_CONNECT_HOME}/conf/connect-log4j.properties
CONNECT_PROPERTIES=${KAFKA_CONNECT_HOME}/conf/connect-distributed.properties

CONFLUENT_HOME=/home1/irteam/apps/confluent
CONFLUENT_KAFKA_CONNECT_SCRIPT=${CONFLUENT_HOME}/bin/connect-distributed
cd ${KAFKA_CONNECT_HOME}
export KAFKA_HEAP_OPTS="-Xmx2g -Xms2g"
export KAFKA_JVM_PERFORMANCE_OPTS="-server -XX:+UseG1GC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:$GC_LOG_DIRECTORY/gc.log -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=2M -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=$GC_LOG_DIRECTORY/heapdump.hprof"
export KAFKA_LOG4J_OPTS="-Dlog4j.configuration=file:${LOG_PROPERTIES}"
PIDFile="application.pid"
function check_if_pid_file_exists {
    if [[ ! -f $PIDFile ]]; then
        echo "PID file not found: $PIDFile"
    fi
}

function check_if_process_is_running {
    if [[ ! -f $PIDFile ]]; then
        return 1
    elif ps -p $(print_process) > /dev/null; then
        return 0
    else
        return 1
    fi
}

function print_process {
    echo $(<"$PIDFile")
}

case "$1" in
    status)
        check_if_pid_file_exists
        if check_if_process_is_running; then
            echo "$(print_process) is running"
        else
            echo "Process not running: $(print_process)"
        fi
        ;;
    stop)
        check_if_pid_file_exists
        if ! check_if_process_is_running; then
            echo "Process $(print_process) already stopped"
            exit 0
        fi
        kill -TERM $(print_process)
        echo "Waiting for process to stop"
        NOT_KILLED=1
        for i in {1..60}; do
            if check_if_process_is_running; then
                echo "."
                sleep 1
            else
                NOT_KILLED=0
                break   # process is gone, stop polling
            fi
        done
        echo
        if [[ $NOT_KILLED = 1 ]]; then
            echo "Cannot kill process $(print_process)"
            exit 1
        fi
        echo "Process stopped"
        ;;
    start)
        if [[ -f $PIDFile ]] && check_if_process_is_running; then
            echo "Process $(print_process) already running"
            exit 1
        fi
        # discard both stdout and stderr so the worker is fully detached
        nohup ${CONFLUENT_KAFKA_CONNECT_SCRIPT} ${CONNECT_PROPERTIES} > /dev/null 2>&1 &
        PID=$!
        echo ${PID} > ${PIDFile}
        echo "Process ${PID} started"
        ;;
    restart)
        $0 stop
        if [[ $? = 1 ]]; then
            exit 1
        fi
        $0 start
        ;;
    *)
        echo "Usage: $0 {start|stop|restart|status}"
        exit 1
        ;;
esac

exit 0
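Assuming the script above is saved as $HOME/application (the name used in the run note further down in these notes), a minimal usage sketch:

# Assumed install location; matches the "sh $HOME/application ..." note below.
sh $HOME/application start    # starts connect-distributed and writes application.pid
sh $HOME/application status   # prints the PID if the worker is running
sh $HOME/application stop     # sends SIGTERM and waits up to 60 seconds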
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Define the root logger with appender file
log4j.rootLogger=INFO, stdout, file
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%d] %p %m (%c:%L)%n
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=/home1/irteam/apps/kafka-connect/log/kafka-connect.log
log4j.appender.file.MaxFileSize=100MB
log4j.appender.file.MaxBackupIndex=100
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=[%d] %p %m (%c:%L)%n
log4j.logger.org.apache.zookeeper=ERROR
log4j.logger.org.I0Itec.zkclient=ERROR
log4j.logger.org.reflections=ERROR
#!/bin/bash
# Create the kafka-connect directories (run as the irteam account)
mkdir -p /home1/irteam/apps/kafka-connect/log/gc
mkdir -p /home1/irteam/apps/kafka-connect/bin
mkdir -p /home1/irteam/apps/kafka-connect/conf
cd /home1/irteam/apps
# Download the Confluent zip file
# https://docs.confluent.io/current/installation/installing_cp/zip-tar.html
curl -O http://packages.confluent.io/archive/5.0/confluent-oss-5.0.1-2.11.zip
unzip confluent-oss-5.0.1-2.11.zip
ln -s confluent-5.0.1 confluent
#!/bin/bash
cd /home1/irteam/apps
curl -O http://apache.mirror.cdnetworks.com/hadoop/common/hadoop-3.0.3/hadoop-3.0.3.tar.gz
tar xvfz hadoop-3.0.3.tar.gz
ln -s hadoop-3.0.3 hadoop
type.name=dunkerque
connector.class=io.confluent.connect.elasticsearch.ElasticsearchSinkConnector
topics=biz-platform-dunkerque
tasks.max=1
batch.size=5000
transforms=dailyIndex
max.retries=3
key.ignore=true
max.in.flight.requests=5
transforms.dailyIndex.type=org.apache.kafka.connect.transforms.TimestampRouter
max.buffered.records=20000
schema.ignore=true
transforms.dailyIndex.timestamp.format=yyyyMMdd
transforms.dailyIndex.topic.format=dunkerque-${timestamp}
value.converter.schemas.enable=false
value.converter=org.apache.kafka.connect.json.JsonConverter
connection.url=http://test-elastic001.ncl.nfra.io:9200,http://test-elastic002.ncl.nfra.io:9200,http://test-elastic003.ncl.nfra.io:9200,http://test-elastic004.ncl.nfra.io:9200,http://test-elastic005.ncl.nfra.io:9200,http://test-elastic006.ncl.nfra.io:9200,http://test-elastic007.ncl.nfra.io:9200,http://test-elastic011-ncl.nfra.io:9200,http://test-elastic012-ncl.nfra.io:9200,http://test-elastic013-ncl.nfra.io:9200,http://test-elastic014-ncl.nfra.io:9200,http://test-elastic015-ncl.nfra.io:9200,http://test-elastic016-ncl.nfra.io:9200,http://test-elastic017-ncl.nfra.io:9200,http://test-elastic018-ncl.nfra.io:9200,http://test-elastic019-ncl.nfra.io:9200,http://test-elastic020-ncl.nfra.io:9200
read.timeout.ms=9000
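A sink configured like this can be registered through the Connect worker's REST interface; a minimal sketch, assuming the worker listens on localhost:8083 and the connector is named es-sink-dunkerque (both are assumptions), showing only a subset of the properties above:

# Hedged sketch: register the sink via the Kafka Connect REST API.
# Worker host/port and the connector name are assumptions.
curl -s -X PUT http://localhost:8083/connectors/es-sink-dunkerque/config \
  -H 'Content-Type: application/json' \
  -d '{
        "connector.class": "io.confluent.connect.elasticsearch.ElasticsearchSinkConnector",
        "topics": "biz-platform-dunkerque",
        "tasks.max": "1",
        "type.name": "dunkerque",
        "key.ignore": "true",
        "schema.ignore": "true",
        "connection.url": "http://test-elastic001.ncl.nfra.io:9200"
      }'

PUT .../config creates the connector if it does not exist and updates it otherwise, so the same call works for initial deployment and for later config changes.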
connector.class=io.confluent.connect.elasticsearch.ElasticsearchSinkConnector
type.name=htl_autocomplete_clk
transforms.Router.topic.format=${topic}-${timestamp}
schema.compatibility=BACKWARD
topics=htl-autocomplete-clk
tasks.max=3
batch.size=1000
transforms=Router
# key.ignore=true: ignore the record key as document ID; IDs are built from topic+partition+offset
key.ignore=true
# schema.ignore=true: do not use the record schema to create the index mapping
schema.ignore=true
key.converter.schemas.enable=false
value.converter.schema.registry.url=http://dev.schema.navercorp.com:8081
transforms.Router.timestamp.format=yyyyMMdd
connection.url=http://dev-hotel-es000-ncl:9200,http://dev-hotel-es001-ncl:9200,http://dev-hotel-es002-ncl:9200
value.converter=io.confluent.connect.avro.AvroConverter
key.converter=org.apache.kafka.connect.storage.StringConverter
transforms.Router.type=org.apache.kafka.connect.transforms.TimestampRouter
read.timeout.ms=9000
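Once registered, connector and task state can be checked through the same REST interface; a small sketch with the same assumed worker address and an assumed connector name:

# Hedged sketch: check connector and task state after registration.
# Worker host/port and the connector name are assumptions.
curl -s http://localhost:8083/connectors/es-sink-htl-autocomplete-clk/status

With the TimestampRouter settings above (topic.format=${topic}-${timestamp}, timestamp.format=yyyyMMdd), records are routed to daily indices such as htl-autocomplete-clk-20190101.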
The Elasticsearch connector writes data from different Kafka topics to different indices, and all data for a topic has the same type in Elasticsearch. This allows schemas for data from different topics to evolve independently, which simplifies schema evolution because Elasticsearch enforces only one constraint on mappings: all fields with the same name in the same index must have the same mapping type.
Elasticsearch supports dynamic mapping: when it encounters a previously unknown field in a document, it uses dynamic mapping to determine the datatype for the field and automatically adds the new field to the type mapping.
When dynamic mapping is enabled, the Elasticsearch connector supports schema evolution. This is because mappings in Elasticsearch are more flexible than the schema evolution allowed in Connect when different converters are used. For example, when the Avro converter is used, backward, forward, and fully compatible schema evolutions are allowed.
When dynamic mapping is enabled, the Elasticsearch connector allows the following schema changes:
- Adding fields: one or more fields are added to Kafka messages; Elasticsearch adds the new fields to the mapping when dynamic mapping is enabled.
- Removing fields: one or more fields are removed from Kafka messages; missing fields are treated as the null value defined for those fields in the mapping.
- Changing types that can be merged: for example, changing a field from an integer type to a string type; Elasticsearch can convert integers to strings.

The following change is not allowed:

- Changing types that cannot be merged: for example, changing a field from a string type to an integer type.

Because mappings are more flexible than this, schema compatibility should be enforced when writing data to Kafka.
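Dynamic mapping is easy to observe directly against a cluster; a minimal sketch for a 6.x-era cluster (host, index, and type names here are placeholders, not the clusters configured above):

# Hedged sketch: index a document whose fields have no explicit mapping yet.
# Host, index, and type names are placeholders.
curl -s -X PUT 'http://localhost:9200/dunkerque-20190101/dunkerque/1' \
  -H 'Content-Type: application/json' \
  -d '{"userId": 123, "page": "/search"}'

# The field types Elasticsearch inferred now appear in the index mapping.
curl -s 'http://localhost:9200/dunkerque-20190101/_mapping?pretty'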
kafka-connector-elasticsearch
https://github.com/confluentinc/kafka-connect-elasticsearch/blob/master/src/main/java/io/confluent/connect/elasticsearch/ElasticsearchSinkConnector.java
exactly once
mapping
Reindexing
test server
kafka connector , kafka
es
install guide
plugin.path
Install connectors by placing their jar files under plugin.path (see the install sketch after these notes).
Configuration
Benefits of building this setup
Things to test and investigate further
When a mapping or parsing error occurs the worker dies; find a way to recover the worker or skip the bad record.
Schema evolution needs testing.
logTime and timestamp indexing
When drop.invalid.message or behavior.on.malformed.documents is set, no error log is written.
Kafka Connect Elasticsearch cluster setup
sh $HOME/application [start|stop|restart]
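A sketch of the jar-based install mentioned in the plugin.path note above, assuming the connector jars have already been obtained locally (the source path is a placeholder); the target directory matches the plugin.path in the worker config below:

# Hedged sketch: install a connector by dropping its jars under plugin.path.
# The source directory is a placeholder; the target matches plugin.path below.
mkdir -p /home1/irteam/apps/confluent/share/confluent-hub-components/kafka-connect-elasticsearch
cp /path/to/kafka-connect-elasticsearch/*.jar \
   /home1/irteam/apps/confluent/share/confluent-hub-components/kafka-connect-elasticsearch/

The Connect worker only scans plugin.path at startup, so it needs a restart after new jars are added.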
value.converter=org.apache.kafka.connect.json.JsonConverter
value.converter.schemas.enable=false
key.converter.schema.registry.url=http://dev.schema.navercorp.com:8081/
value.converter.schema.registry.url=http://dev.schema.navercorp.com:8081
group.id=test-velvet-connect-cluster
offset.storage.topic=test-velvet-connect-offsets
offset.storage.replication.factor=1
config.storage.topic=test-velvet-connect-configs
config.storage.replication.factor=1
status.storage.topic=test-velvet-connect-status
status.storage.replication.factor=1
offset.flush.interval.ms=10000
plugin.path=/home1/irteam/apps/kafka-connector-hdfs/plugins
plugin.path=/home1/irteam/apps/confluent/share/java,/home1/irteam/apps/confluent/share/confluent-hub-components
access.control.allow.methods=GET,POST,PUT,OPTIONS
access.control.allow.origin=*
topic.schema.ignore=true
topic.key.ignore=true
drop.invalid.message=true
behavior.on.null.values=ignore
behavior.on.malformed.documents=ignore
error
errors.tolerance=all
errors.log.enable=true
errors.log.include.messages=true
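These errors.* settings are per-connector properties (available since Kafka 2.0 / Confluent Platform 5.0), so one way to apply them is to merge them into a sink connector's config; a minimal sketch, reusing the assumed worker address and connector name from earlier (PUT replaces the whole config, so the base properties must be included again):

# Hedged sketch: add error-tolerance settings to an existing sink connector.
# Worker host/port and the connector name are the same assumptions as above.
curl -s -X PUT http://localhost:8083/connectors/es-sink-dunkerque/config \
  -H 'Content-Type: application/json' \
  -d '{
        "connector.class": "io.confluent.connect.elasticsearch.ElasticsearchSinkConnector",
        "topics": "biz-platform-dunkerque",
        "type.name": "dunkerque",
        "connection.url": "http://test-elastic001.ncl.nfra.io:9200",
        "errors.tolerance": "all",
        "errors.log.enable": "true",
        "errors.log.include.messages": "true"
      }'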
vshopping-log-click example (es)