cndaqiang / E5-PC-daily

服务器集群管理遇到的问题和总结
1 stars 0 forks source link

两节点 Xeon Platinum 9242 - Intel 配置 #39

Open cndaqiang opened 3 years ago

cndaqiang commented 3 years ago

信息备份

[root@node7 cndaqiang]# cat /etc/hosts
127.0.0.1   localhost localhost.localdomain localhost4 localhost4.localdomain4
::1         localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.1.1     managernode.localdomain managernode mngnode
192.168.1.10    node1.localdomain node1
192.168.1.11    node2.localdomain node2
192.168.1.12    node3.localdomain node3
192.168.1.13    node4.localdomain node4
192.168.1.14    node5.localdomain node5
192.168.1.16    node6.localdomain node6
192.168.1.17    node7.localdomain node7
192.168.1.18    node8.localdomain node8
192.168.1.19    node9.localdomain node9
192.168.1.20    node10.localdomain node10
172.16.0.1 ib_mngnode
172.16.0.10 ib_node1
172.16.0.11 ib_node2
172.16.0.12 ib_node3
172.16.0.13 ib_node4
172.16.0.14 ib_node5
172.16.0.17 ib_node7
172.16.0.18 ib_node8
172.16.0.19 ib_node9
172.16.0.20 ib_node10
[root@node7 cndaqiang]# cat /etc/sysconfig/network-scripts/ifcfg-eno1
TYPE=Ethernet
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=eno1
UUID=aca3a44d-7ee8-4015-9767-6a53d388f80e
DEVICE=eno1
ONBOOT=yes
IPADDR=10.127.6.5
PREFIX=24
GATEWAY=10.127.6.254
IPV6_PRIVACY=no
[root@node7 cndaqiang]# cat /etc/sysconfig/network-scripts/ifcfg-ib0
CONNECTED_MODE=no
TYPE=InfiniBand
PROXY_METHOD=none
BROWSER_ONLY=no
BOOTPROTO=none
DEFROUTE=yes
IPV4_FAILURE_FATAL=no
IPV6INIT=yes
IPV6_AUTOCONF=yes
IPV6_DEFROUTE=yes
IPV6_FAILURE_FATAL=no
IPV6_ADDR_GEN_MODE=stable-privacy
NAME=ib0
UUID=36e569a3-f511-4516-9b94-08a612f2e8a9
DEVICE=ib0
ONBOOT=yes
IPADDR=172.16.0.17
PREFIX=24
cndaqiang commented 3 years ago

先配置node7

node7有账户密码,不重装系统,仅修改配置;采用ipv6网络连接配置;禁用nfs挂载

[root@node7 cndaqiang]# cat /etc/fstab

#
# /etc/fstab
# Created by anaconda on Mon Dec  9 11:53:12 2019
#
# Accessible filesystems, by reference, are maintained under '/dev/disk'
# See man pages fstab(5), findfs(8), mount(8) and/or blkid(8) for more info
#
/dev/mapper/centos-root /                       xfs     defaults        0 0
UUID=f6c8b1e2-4663-4201-bf9b-414ab336963d /boot                   xfs     defaults        0 0
UUID=BF7A-7015          /boot/efi               vfat    umask=0077,shortname=winnt 0 0
#/dev/mapper/centos-home /home                   xfs     defaults        0 0
/dev/mapper/centos-swap swap                    swap    defaults        0 0
#172.16.0.1:/home        /home           nfs     defaults,_netdev        0 0
#172.16.0.1:/software       /software           nfs     defaults,_netdev        0 0

关闭nis认证

[root@node7 cndaqiang]# systemctl disable ypbind
Removed symlink /etc/systemd/system/multi-user.target.wants/ypbind.service.
[root@node7 cndaqiang]# systemctl disable rpcbind
Removed symlink /etc/systemd/system/multi-user.target.wants/rpcbind.service.

设置tuna地址

2402:f000:1:408:8100::1 mirrors6.tuna.tsinghua.edu.cn mirrors.tuna.tsinghua.edu.cn

更新依赖

# Update the installed packages.
yum update
# Install the gcc package.
yum install gcc

其他的都有问题,网络不好,其他的软件自己编译

cndaqiang commented 3 years ago

编译环境

一键配置脚本

# One-shot setup: create the install prefix and remember it in ROOT.
mkdir -p ~/soft/gnu4-mvapich
cd ~/soft/gnu4-mvapich
ROOT=$PWD
# -p keeps a re-run from failing when source/ already exists.
mkdir -p "$ROOT/source"

# Print the environment settings for the libraries built below. ROOT is
# expanded now; every \$-escaped name is printed literally.
# The space between -e and the opening quote is required: without it echo
# receives a single word starting with "-e" and prints that prefix as text.
echo -e "
#把下面的执行结果复制到环境变量
ROOT=$ROOT
MATHDIR=\$ROOT/math/lib
export LD_LIBRARY_PATH=\$ROOT/mvapich2-2.3.1/lib:\$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=\$ROOT/libxc-4.3.4/lib:\$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=\$ROOT/fftw-3.3.3/lib:\$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=\$ROOT/math/lib:\$LD_LIBRARY_PATH
export LIBRARY_PATH=\$ROOT/mvapich2-2.3.1/lib:\$LIBRARY_PATH
export LIBRARY_PATH=\$ROOT/libxc-4.3.4/lib:\$LIBRARY_PATH
export LIBRARY_PATH=\$ROOT/fftw-3.3.3/lib:\$LIBRARY_PATH
export LIBRARY_PATH=\$ROOT/math/lib:\$LIBRARY_PATH
export C_INCLUDE_PATH=\$ROOT/mvapich2-2.3.1/include:\$C_INCLUDE_PATH
export C_INCLUDE_PATH=\$ROOT/libxc-4.3.4/include:\$C_INCLUDE_PATH
export C_INCLUDE_PATH=\$ROOT/fftw-3.3.3/include:\$C_INCLUDE_PATH
export C_INCLUDE_PATH=\$ROOT/math/include:\$C_INCLUDE_PATH
export PATH=\$ROOT/mvapich2-2.3.1/bin:\$PATH
export PATH=\$ROOT/fftw-3.3.3/bin:\$PATH
"

# mvapich
# Build MVAPICH2 2.3.1 from the local tarball copy and install it under
# $ROOT/mvapich2-2.3.1. The steps are chained with && so a failed cd,
# configure or make does not fall through to "make install".
cd "$ROOT/source" &&
  cp ~/packages/mirrors/mvapich2/mvapich2-2.3.1.tar.gz . &&
  tar xzvf mvapich2-2.3.1.tar.gz &&
  cd mvapich2-2.3.1/ &&
  ./configure --prefix="$ROOT/mvapich2-2.3.1" CC=gcc FC=gfortran CXX=g++ &&
  make -j40 &&
  make install

# Build the shared-library ScaLAPACK stack from scratch.
# Make sure the math prefix has lib/ and include/. mkdir -p also creates them
# when $ROOT/math already exists without its subdirectories.
mkdir -p "$ROOT/math/lib" "$ROOT/math/include"

# Unpack a fresh lapack-3.8.0 tree from the local tarball copy; any previous
# tree is removed first so the Makefile edits below are not appended twice.
cd $ROOT/source
cp ~/packages//mirrors/math/lapack-3.8.0.tar.gz .
rm -rf lapack-3.8.0
tar xzvf lapack-3.8.0.tar.gz
cd lapack-3.8.0/
# Adjust the build parameters so that shared libraries are produced by default.
# In the snippets below, echo -e turns \t into the tab that starts a make
# recipe line, and \$ stops the shell from expanding the make variables.

# BLAS & LAPACK
# Append a libblas.so rule to BLAS/SRC/Makefile and hook it into "all".
echo -e "
all:libblas.so
libblas.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> BLAS/SRC/Makefile
# TMG
# Same for libtmg.so in TESTING/MATGEN/Makefile.
echo -e "
all:libtmg.so
libtmg.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> TESTING/MATGEN/Makefile
# LAPACK
# Same for liblapack.so in SRC/Makefile.
echo -e "
all: liblapack.so
liblapack.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> SRC/Makefile

# Start from the example make.inc and add -fPIC to both option sets.
cp make.inc.example make.inc
echo "OPTS += -fPIC" >> make.inc
echo "NOOPT += -fPIC" >> make.inc
# Append an "install" target that copies the built libraries into MATHDIR.
# $ROOT and $PWD are expanded by the shell right here, so the rule holds
# absolute paths into the current lapack-3.8.0 directory.
echo -e "
MATHDIR=$ROOT/math/lib
install:all
\t cp $PWD/BLAS/SRC/libblas.so \$(MATHDIR)
\t cp $PWD/SRC/liblapack.so \$(MATHDIR)
\t cp $PWD/TESTING/MATGEN/libtmg.so \$(MATHDIR)
\t cp $PWD/*.a \$(MATHDIR)
" >> make.inc
# Install
# NOTE(review): only a bare "make" is run; presumably the "install" target
# appended to make.inc ends up as the default goal -- confirm.
make -j36 # Must be a plain make; other arguments, or running make again afterwards, build the test programs, which is unnecessary
# Do NOT change this line in the Makefile
#   all: lapack_install lib blas_testing lapack_testing
# into
#   all: lapack_install lib
#   #blas_testing lapack_testing
# Doing so would skip the tests and build faster,
# but blas_testing depends on the blas build.
# If you insist on commenting it out, add a dependency on blas.
# ScaLAPACK
# Unpack a fresh scalapack-2.0.2 tree from the local tarball copy.
cd $ROOT/source
cp ~/packages//mirrors/math/scalapack-2.0.2.tgz .
rm -rf scalapack-2.0.2
tar xzvf scalapack-2.0.2.tgz
cd scalapack-2.0.2

# Disabled alternative, kept for reference: a libscalapack.so rule appended
# to SRC/Makefile.
#echo -e "
#ALLOBJ += \$(SLASRC) \$(DLASRC)  \$(CLASRC) \$(ZLASRC)  \
#\t   \$(SCLAUX) \$(DZLAUX) \$(ALLAUX)
#all:libscalapack.so
#libscalapack.so: \$(ALLOBJ)
#\t \$(FC) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
#" >> SRC/Makefile

# Compile with -fPIC so the static library can be converted to a shared one.
cp SLmake.inc.example SLmake.inc
echo "FC += -fPIC" >> SLmake.inc
echo "CC += -fPIC" >> SLmake.inc
# Append an "install" target that links the whole of libscalapack.a into
# libscalapack.so and copies it into MATHDIR. $ROOT and $PWD are expanded by
# the shell here; \$ keeps the make variables literal.
echo -e "
MATHDIR=$ROOT/math/lib
install: all
\t \$(FC) -shared -o $PWD/libscalapack.so -Wl,--whole-archive $PWD/libscalapack.a -Wl,--no-whole-archive
\t cp $PWD/libscalapack.so \$(MATHDIR)
" >> SLmake.inc
# -j20 is not supported: the build has to go one library at a time, adding each
# to libscalapack.a in turn. libscalapack.a exists even if the build failed.
make  install

#====================
# Optional extras
# Each build is chained with && so "make install" never runs after a failed
# cd, configure or make (the original used "make -j30; make install").

# libxc-4.3.4 for octopus-10.1
# --enable-shared builds the shared library.
cd "$ROOT/source" &&
  cp ~/packages//mirrors/libxc/libxc-4.3.4.tar.gz . &&
  tar xzvf libxc-4.3.4.tar.gz &&
  cd libxc-4.3.4 &&
  ./configure --prefix="$ROOT/libxc-4.3.4" CC=gcc CXX=g++ FC=gfortran --enable-shared &&
  make -j30 &&
  make install

# fftw
cd "$ROOT/source" &&
  cp ~/packages//mirrors/fftw/fftw-3.3.3.tar.gz . &&
  tar xzvf fftw-3.3.3.tar.gz &&
  cd fftw-3.3.3/ &&
  ./configure --prefix="$ROOT/fftw-3.3.3" --enable-mpi --enable-shared &&
  make -j30 &&
  make install
cndaqiang commented 3 years ago
# IPv6 package source
# Become root, map the TUNA mirror host names to an IPv6 address in
# /etc/hosts, back up CentOS-Base.repo and replace it with the file below.
# Inside the here-document, \$ keeps $releasever/$basearch literal in the
# written file.
sudo su
echo '2402:f000:1:408:8100::1 mirrors6.tuna.tsinghua.edu.cn mirrors.tuna.tsinghua.edu.cn' >> /etc/hosts
cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak
cat << EOF > /etc/yum.repos.d/CentOS-Base.repo
# CentOS-Base.repo
#
# The mirror system uses the connecting IP address of the client and the
# update status of each mirror to pick mirrors that are updated to and
# geographically close to the client.  You should use this for CentOS updates
# unless you are manually picking other mirrors.
#
# If the mirrorlist= does not work for you, as a fall back you can try the
# remarked out baseurl= line instead.
#
#

[base]
name=CentOS-\$releasever - Base
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/\$releasever/os/\$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=\$releasever&arch=\$basearch&repo=os
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7

#released updates
[updates]
name=CentOS-\$releasever - Updates
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/\$releasever/updates/\$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=\$releasever&arch=\$basearch&repo=updates
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7

#additional packages that may be useful
[extras]
name=CentOS-\$releasever - Extras
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/\$releasever/extras/\$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=\$releasever&arch=\$basearch&repo=extras
enabled=1
gpgcheck=1
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7

#additional packages that extend functionality of existing packages
[centosplus]
name=CentOS-\$releasever - Plus
baseurl=https://mirrors.tuna.tsinghua.edu.cn/centos/\$releasever/centosplus/\$basearch/
#mirrorlist=http://mirrorlist.centos.org/?release=\$releasever&arch=\$basearch&repo=centosplus
gpgcheck=1
enabled=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-7

EOF
# The GPG key name used for the Tsinghua mirror differs from the CentOS one;
# it is worked around here by copying the CentOS key to the name given in the
# gpgkey= lines above.
yum makecache
cp /etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 /etc/pki/rpm-gpg/RPM-GPG-KEY-7

# EPEL repository
# Download the EPEL 7 signing key into /etc/pki/rpm-gpg and install the
# epel-release package.
cd /etc/pki/rpm-gpg
wget https://mirrors.tuna.tsinghua.edu.cn/epel/RPM-GPG-KEY-EPEL-7
yum -y install epel-release

# Edit both EPEL repo files in place: comment out metalink=, enable baseurl=,
# swap the Fedora download host for the TUNA mirror and force https on it.
sed -i \
    -e 's!^metalink=!#metalink=!g' \
    -e 's!^#baseurl=!baseurl=!g' \
    -e 's!//download\.fedoraproject\.org/pub!//mirrors.tuna.tsinghua.edu.cn!g' \
    -e 's!http://mirrors\.tuna!https://mirrors.tuna!g' \
    /etc/yum.repos.d/epel.repo \
    /etc/yum.repos.d/epel-testing.repo

# node7
# Host name
hostnamectl --static set-hostname master

# Static address on ib0: switch BOOTPROTO from dhcp to none, enable the
# interface at boot, then append the address and prefix length.
sed -i \
    -e 's/BOOTPROTO=dhcp/BOOTPROTO=none/g' \
    -e 's/ONBOOT=no/ONBOOT=yes/g' \
    /etc/sysconfig/network-scripts/ifcfg-ib0
{
  echo IPADDR=172.16.100.7
  echo PREFIX=24
} >> /etc/sysconfig/network-scripts/ifcfg-ib0

# node8
hostnamectl --static set-hostname client01

sed -i \
    -e 's/BOOTPROTO=dhcp/BOOTPROTO=none/g' \
    -e 's/ONBOOT=no/ONBOOT=yes/g' \
    /etc/sysconfig/network-scripts/ifcfg-ib0
{
  echo IPADDR=172.16.100.8
  echo PREFIX=24
} >> /etc/sysconfig/network-scripts/ifcfg-ib0

# Name both IB addresses in /etc/hosts.
# NOTE(review): presumably this and the commands below run on both nodes -- confirm.
printf '%s\n' '172.16.100.7 master' '172.16.100.8 client01' >> /etc/hosts

# Bring up the IB interface.
ifup ib0

# Accept everything arriving on ib0 or coming from the IB subnet, and set the
# OUTPUT policy to ACCEPT.
iptables -A INPUT -i ib0 -j ACCEPT
iptables -I INPUT -s 172.16.100.0/24 -j ACCEPT
iptables -P OUTPUT ACCEPT

# The rest is the same as https://cndaqiang.github.io//2019/09/19/Centos7-CC19/

# What differs: the NFS export permissions.
# Each entry is single-quoted: unquoted parentheses are a shell syntax error,
# and the bare * would be subject to pathname expansion.
echo '/home 172.16.100.0/24(rw,no_root_squash,sync)' >> /etc/exports
echo '/opt 172.16.100.0/24(rw,no_root_squash,sync)' >> /etc/exports
echo '/share *(rw,no_root_squash,sync)' >> /etc/exports

master:/home     /home                   nfs     defaults        0 0
master:/opt      /opt                    nfs     defaults        0 0

用于mac传输文件

(python37) cndaqiang@mac ~$ sudo mkdir /Users/node7
(python37) cndaqiang@mac ~$ sudo chown cndaqiang:cndaqiang /Users/node7
(python37) cndaqiang@mac ~$ sudo mount_nfs -P -o nolocks,nosuid node7:/share /Users/node7


## If the IB card is not recognised, or cannot be brought up after being
## configured, install the IB driver packages
for ib_pkg in infiniband-diags opensm; do
  yum install -y "$ib_pkg"
done
systemctl start opensm
systemctl enable opensm

# The IB libraries are needed later when compiling mvapich
for ib_pkg in libibverbs libibverbs-devel libibmad-devel; do
  yum install -y "$ib_pkg"
done

# Without an IB switch the two machines are connected directly: when one goes
# down, IB on the other goes offline automatically,
# and everything on that machine configured against the IB address stops working.
# So file/software sharing and NIS use IB
# Do not set up PBS
# Share only the common software directory; do not share home either
# Start the services by hand once both machines are up

export LD_LIBRARY_PATH=/usr/lib64/libibverbs:$LD_LIBRARY_PATH

## Build environment

ROOT=/opt/gnu-mvapich
# One mkdir -p creates both the prefix and source/, and a re-run does not fail.
mkdir -p "$ROOT/source"

# Create the build helper script.
# $ROOT is expanded while make.sh is written, so the script carries the
# absolute prefix. It runs ./setup.py when ./netlib.py exists, otherwise
# make -j10, make, make install.
# NOTE(review): the check is for netlib.py but the command is setup.py -- confirm.
cd "$ROOT/source"
cat << EOF > ./make.sh
if [ -e ./netlib.py ]
then
    ./setup.py --prefix=$ROOT/math --downall
else
    make -j10
    make
    make install
fi
EOF

# Search paths for the libraries installed under $ROOT.
# The mvapich2 entries match the environment printed by the ~/soft variant of
# this script; without them the MPI wrappers installed by the mvapich step are
# not on PATH for the later --enable-mpi build.
MATHDIR=$ROOT/math/lib
export LD_LIBRARY_PATH=$ROOT/mvapich2-2.3.1/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$ROOT/libxc-4.3.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$ROOT/fftw-3.3.3/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$ROOT/math/lib:$LD_LIBRARY_PATH
export LIBRARY_PATH=$ROOT/mvapich2-2.3.1/lib:$LIBRARY_PATH
export LIBRARY_PATH=$ROOT/libxc-4.3.4/lib:$LIBRARY_PATH
export LIBRARY_PATH=$ROOT/fftw-3.3.3/lib:$LIBRARY_PATH
export LIBRARY_PATH=$ROOT/math/lib:$LIBRARY_PATH
export C_INCLUDE_PATH=$ROOT/mvapich2-2.3.1/include:$C_INCLUDE_PATH
export C_INCLUDE_PATH=$ROOT/libxc-4.3.4/include:$C_INCLUDE_PATH
export C_INCLUDE_PATH=$ROOT/fftw-3.3.3/include:$C_INCLUDE_PATH
export C_INCLUDE_PATH=$ROOT/math/include:$C_INCLUDE_PATH
export PATH=$ROOT/mvapich2-2.3.1/bin:$PATH
export PATH=$ROOT/fftw-3.3.3/bin:$PATH

# mvapich
# Download, build and install MVAPICH2 2.3.1 under $ROOT/mvapich2-2.3.1.
# The steps are chained with && so a failed download, configure or make does
# not fall through to "make install".
cd "$ROOT/source" &&
  wget https://cndaqiang.gitee.io/packages/mirrors/mvapich2/mvapich2-2.3.1.tar.gz &&
  tar xzvf mvapich2-2.3.1.tar.gz &&
  cd mvapich2-2.3.1/ &&
  ./configure --prefix="$ROOT/mvapich2-2.3.1" CC=gcc FC=gfortran CXX=g++ &&
  make -j40 &&
  make install

# libxc-4.3.4 for octopus-10.1
# --enable-shared builds the shared library. The steps are chained with && so
# ../make.sh (make -j10, make, make install) only runs after a successful
# download and configure.
cd "$ROOT/source" &&
  wget https://cndaqiang.gitee.io/packages//mirrors/libxc/libxc-4.3.4.tar.gz &&
  tar xzvf libxc-4.3.4.tar.gz &&
  cd libxc-4.3.4 &&
  ./configure --prefix="$ROOT/libxc-4.3.4" CC=gcc CXX=g++ FC=gfortran --enable-shared &&
  bash ../make.sh

# fftw
cd "$ROOT/source" &&
  wget https://cndaqiang.gitee.io/packages//mirrors/fftw/fftw-3.3.3.tar.gz &&
  tar xzvf fftw-3.3.3.tar.gz &&
  cd fftw-3.3.3/ &&
  ./configure --prefix="$ROOT/fftw-3.3.3" --enable-mpi --enable-shared &&
  bash ../make.sh

# Build the shared-library ScaLAPACK stack from scratch.
# Make sure the math prefix has lib/ and include/. mkdir -p also creates them
# when $ROOT/math already exists without its subdirectories.
mkdir -p "$ROOT/math/lib" "$ROOT/math/include"

# Download lapack-3.8.0 and unpack a fresh tree; any previous tree is removed
# first so the Makefile edits below are not appended twice.
cd $ROOT/source
wget https://cndaqiang.gitee.io/packages//mirrors/math/lapack-3.8.0.tar.gz
rm -rf lapack-3.8.0
tar xzvf lapack-3.8.0.tar.gz
cd lapack-3.8.0/
# Adjust the build parameters so that shared libraries are produced by default.
# In the snippets below, echo -e turns \t into the tab that starts a make
# recipe line, and \$ stops the shell from expanding the make variables.

# BLAS & LAPACK
# Append a libblas.so rule to BLAS/SRC/Makefile and hook it into "all".
echo -e "
all:libblas.so
libblas.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> BLAS/SRC/Makefile
# TMG
# Same for libtmg.so in TESTING/MATGEN/Makefile.
echo -e "
all:libtmg.so
libtmg.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> TESTING/MATGEN/Makefile
# LAPACK
# Same for liblapack.so in SRC/Makefile.
echo -e "
all: liblapack.so
liblapack.so: \$(ALLOBJ)
\t \$(FORTRAN) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
" >> SRC/Makefile

# Start from the example make.inc and add -fPIC to both option sets.
cp make.inc.example make.inc
echo "OPTS += -fPIC" >> make.inc
echo "NOOPT += -fPIC" >> make.inc
# Append an "install" target that copies the built libraries into MATHDIR.
# $ROOT and $PWD are expanded by the shell right here, so the rule holds
# absolute paths into the current lapack-3.8.0 directory.
echo -e "
MATHDIR=$ROOT/math/lib
install:all
\t cp $PWD/BLAS/SRC/libblas.so \$(MATHDIR)
\t cp $PWD/SRC/liblapack.so \$(MATHDIR)
\t cp $PWD/TESTING/MATGEN/libtmg.so \$(MATHDIR)
\t cp $PWD/*.a \$(MATHDIR)
" >> make.inc
# Install
# NOTE(review): only a bare "make" is run; presumably the "install" target
# appended to make.inc ends up as the default goal -- confirm.
make -j36 # Must be a plain make; other arguments, or running make again afterwards, build the test programs, which is unnecessary
# Do NOT change this line in the Makefile
#   all: lapack_install lib blas_testing lapack_testing
# into
#   all: lapack_install lib
#   #blas_testing lapack_testing
# Doing so would skip the tests and build faster,
# but blas_testing depends on the blas build.
# If you insist on commenting it out, add a dependency on blas.

# ScaLAPACK
# Download scalapack-2.0.2 and unpack a fresh tree.
cd $ROOT/source
wget https://cndaqiang.gitee.io/packages//mirrors/math/scalapack-2.0.2.tgz
rm -rf scalapack-2.0.2
tar xzvf scalapack-2.0.2.tgz
cd scalapack-2.0.2

# Disabled alternative, kept for reference: a libscalapack.so rule appended
# to SRC/Makefile.
#echo -e "
#ALLOBJ += \$(SLASRC) \$(DLASRC)  \$(CLASRC) \$(ZLASRC)  \
#\t   \$(SCLAUX) \$(DZLAUX) \$(ALLAUX)
#all:libscalapack.so
#libscalapack.so: \$(ALLOBJ)
#\t \$(FC) -shared -Wl,-soname,\$@ -o \$@ \$(ALLOBJ)
#" >> SRC/Makefile

# Compile with -fPIC so the static library can be converted to a shared one.
cp SLmake.inc.example SLmake.inc
echo "FC += -fPIC" >> SLmake.inc
echo "CC += -fPIC" >> SLmake.inc
# Append an "install" target that links the whole of libscalapack.a into
# libscalapack.so and copies it into MATHDIR. $ROOT and $PWD are expanded by
# the shell here; \$ keeps the make variables literal.
echo -e "
MATHDIR=$ROOT/math/lib
install: all
\t \$(FC) -shared -o $PWD/libscalapack.so -Wl,--whole-archive $PWD/libscalapack.a -Wl,--no-whole-archive
\t cp $PWD/libscalapack.so \$(MATHDIR)
" >> SLmake.inc
# -j20 is not supported: the build has to go one library at a time, adding each
# to libscalapack.a in turn. libscalapack.a exists even if the build failed.
make  install