Commit f29779c7 authored by gerd's avatar gerd

improved deployment system


git-svn-id: https://gps.dynxs.de/private/svn/app-plasma/trunk@229 55289a75-7b90-4627-9e07-ffb4263930b2
parent 2de6c688
......@@ -23,6 +23,8 @@ for mapred demo
12. map task generation bug - DONE
13. allow bigger blocksizes - DONE
mount -o soft,port=2801,mountport=2800,nolock office3:/foo /mnt
mapred_job_exec: also take memory load into account, especially for the
shuffle jobs
- or: automatically adjust mem consumption if memory is tight
......@@ -65,6 +67,9 @@ CHECK: reduce output should use repl=0 - DONE
[Sun Jun 6 01:22:17 2010] [Nn_manager] [crit] run: Exception Rpc_server.Connection_lost
(nnoffice3.log.old)
Plasma_client: find_coord funktioniert nicht wenn nicht alle namenodes
angegeben sind
sort_limit: better int instead of int64
more investigations on how to optimize sort
......
......@@ -4,14 +4,18 @@
# does NOT initialize datanodes
usage () {
echo "usage: $0 inst_name" >&2
echo "usage: $0 [-only-config] inst_name" >&2
exit 2
}
set -e
inst=""
only_config=0
while [ $# -gt 0 ]; do
case "$1" in
-only-config)
only_config=1; shift ;;
-*)
usage ;;
*)
......@@ -63,8 +67,10 @@ for host in $hosts; do
scp instances/$inst/*.conf instances/$inst/*.hosts instances/$inst/*.sh \
$host:"$prefix/etc"
scp ../sql/namenode.sql $host:"$prefix/etc"
scp ../src/pfs_daemon/plasmad ../src/pfs_nfs3/nfs3d ../src/pfs_admin/plasma_datanode_init "$na" \
$host:"$prefix/bin"
if [ $only_config -eq 0 ]; then
scp ../src/pfs_daemon/plasmad ../src/pfs_nfs3/nfs3d ../src/pfs_admin/plasma_datanode_init "$na" \
$host:"$prefix/bin"
fi
done
......@@ -3,6 +3,8 @@
# Creates the PostgreSQL databases. It is required that the software
# is already deployed (deploy_inst.sh).
set -e
usage () {
echo "usage: $0 [-drop] inst_name" >&2
exit 2
......@@ -10,9 +12,12 @@ usage () {
inst=""
drop=0
destroy_ok=0
while [ $# -gt 0 ]; do
case "$1" in
-I-want-to-destroy-data)
destroy_ok=1; shift ;;
-drop)
drop=1; shift ;;
-*)
......@@ -37,6 +42,12 @@ if [ ! -d "instances/$inst" ]; then
exit 2
fi
if [ $drop -ne 0 -a $destroy_ok -eq 0 ]; then
echo "Error: In order to drop a database, you need to call this script" >&2
echo "with the additional switch -I-want-to-destroy-data!" >&2
exit 2
fi
# Collect now all namenode hosts from the .hosts files and form the union set:
hosts="$(awk '
......@@ -61,8 +72,9 @@ for host in $hosts; do
else
echo "Creating database plasma_$inst on $host:"
ssh $host createdb "plasma_$inst"
ssh $host psql "plasma_$inst" "-f '$prefix/etc/namenode.sql'" -q 2>&1 |
grep -v NOTICE
ssh $host psql -v ON_ERROR_STOP=1 \
"plasma_$inst" "-f '$prefix/etc/namenode.sql'" -q 2>&1 |
grep -v NOTICE || true
fi
echo " done"
done
......@@ -5,6 +5,8 @@
# For this, the software must already be deployed (deploy_inst.sh), and
# the namenode daemon must already be running (rc_nn.sh)
set -e
usage () {
echo "usage: $0 inst_name size[KMG] (host host ... | all)" >&2
exit 2
......@@ -14,9 +16,12 @@ inst=""
size=""
hosts=""
all_hosts=0
destroy_ok=0
while [ $# -gt 0 ]; do
case "$1" in
-I-want-to-destroy-data)
destroy_ok=1; shift ;;
-*)
usage ;;
*)
......@@ -74,7 +79,7 @@ case "$size" in
size="$(echo "$size * 1073741824" | bc)" ;;
esac
blocks="(echo "$size / $blocksize" | bc)"
blocks="$(echo "$size / $blocksize" | bc)"
echo "Using $blocks blocks"
if [ $all_hosts -gt 0 ]; then
......@@ -92,19 +97,54 @@ if [ $all_hosts -gt 0 ]; then
fi
fi
# TODO
nn=XXX
nn_host="$(awk '
/^[:blank:]*$/ { next }
/^[:blank:]*#/ { next }
/^\w+[:blank:]*#/ { hosts[$1]=1; next }
/^\w+[:blank:]*$/ { hosts[$1]=1; next }
// { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
END { for (host in hosts) { print host; break } }' \
instances/$inst/namenode.hosts)"
nn="-namenode $nn_host:$nn_port -cluster $inst"
# Check $nn:
echo "Testing namenode connectivity:"
if ../src/pfs_admin/plasma_admin list_datanodes $nn >/dev/null;
then
echo "ok"
else
echo "Error: No namenode found"
echo " (hint: you may have to start the namenodes first, using rc_nn.sh)"
exit 2
fi
for host in $hosts; do
echo "Checking $host:"
have_data="$(ssh $host "if [ -f '$prefix/data/data' ]; then echo 1; else echo 0; fi")"
if [ $have_data -gt 0 -a $destroy_ok -eq 0 ]; then
echo "Error: There are already data blocks on $host. This script does not" >&2
echo "overwrite data by default. If you want to do so, and destroy the" >&2
echo "existing data blocks, call this script again with the additional" >&2
echo "switch: -I-want-to-destroy-data" >&2
exit 2
fi
echo "Initializing on $host:"
ssh $host \
"$prefix/bin/plasma_datanode_init $prefix/data $blocks"
"rm -f '$prefix/data/config' '$prefix/data/data'"
ssh $host \
"$prefix/bin/plasma_datanode_init '$prefix/data' $blocks"
id="$(ssh $host head -1 $prefix/data/config)"
echo " starting datanode on $host:"
ssh $host \
"$prefix/etc/rc_dn.sh start"
echo " adding identity to namenode"
../src/pfs_admin/plasma_admin \
add_datanode -namenode $nn -size $blocks $id
add_datanode $nn -size $blocks "$id"
echo " enabling"
../src/pfs_admin/plasma_admin \
enable_datanode -namenode $nn $id $host:2728
# FIXME: dn port number should be configurable
enable_datanode $nn "$id" "$host:$dn_port"
done
echo "Done"
(* -*- tuareg -*- *)
(* Netplex configuration for the Plasma NFS bridge daemon.
   <INST> and <PREFIX> are placeholders; presumably they are substituted
   per instance before the file is deployed - TODO confirm against the
   instance-creation script. *)
netplex {
controller {
(* NOTE(review): the nfsd rc script appears to use the same directory for
   its pid file and for netplex-admin; keep the two in sync. *)
socket_directory = "/tmp/plasma_nfs_<INST>";
max_level = "debug"; (* Log level *)
logging {
(* Three log files under <PREFIX>/log. Each one accepts every component
   and they differ only in max_level: debug, info and crit. *)
type = "multi_file";
directory = "<PREFIX>/log";
file {
file = "nfsnode_debug.log";
component = "*";
max_level = "debug";
};
file {
file = "nfsnode_info.log";
component = "*";
max_level = "info";
};
file {
file = "nfsnode_crit.log";
component = "*";
max_level = "crit";
};
}
};
(* How to reach the namenodes: cluster name, the file listing the namenode
   hosts, and the port to contact them on. *)
namenodes {
clustername = "<INST>";
node_list = "namenode.hosts";
port = 2730;
};
(* A single service offering two protocols, each bound to 0.0.0.0 on its
   own port: mount3 on 2800 and nfs3 on 2801. Clients must be given both
   port numbers when mounting. *)
service {
name = "Nfs3";
protocol {
name = "mount3";
address {
type = "internet";
bind = "0.0.0.0:2800"
}
};
protocol {
name = "nfs3";
address {
type = "internet";
bind = "0.0.0.0:2801"
}
};
processor {
type = "nfs";
nfs3 { };
mount3 { };
};
(* NOTE(review): constant workload with threads = 1 presumably means one
   worker serves all requests - confirm before raising the value. *)
workload_manager {
type = "constant";
threads = 1;
};
};
}
......@@ -2,3 +2,87 @@
# rc script for datanode start/stop
# Runs on the datanode host itself. The statements below are order
# dependent: global.conf must be sourced before $inst and $prefix are used.

# Directory containing this script; global.conf and the *.hosts files are
# looked up next to it.
thisdir="$(dirname "$0")"
# Instance settings. $inst and $prefix used below are expected to be
# defined by this file.
. "$thisdir/global.conf"
descrname="Plasma datanode"        # human-readable name used in messages
progname="plasmad"                 # executable under $prefix/bin; also checked by status
logname="datanode"                 # prefix of the stderr log file name
sockdir="/tmp/plasma_dn_$inst"     # holds the pid file; passed to netplex-admin
conf="$prefix/etc/datanode.conf"   # configuration file handed to the daemon
status() {
    # Report whether the daemon is running.
    # Returns 0 only if $sockdir/pid names a live process whose executable
    # is called $progname; returns 1 in every other case.
    # Side effect: the global $pid holds the content of the pid file.
    pid=$(cat "$sockdir/pid" 2>/dev/null)

    # $pid is deliberately unquoted: with an empty pid file this becomes a
    # plain "kill -0", which fails and is reported as "not running".
    kill -0 $pid 2>/dev/null || return 1

    # /proc/<pid>/exe points at the running binary; the sed drops the
    # " (deleted)" marker so only the program name is compared.
    exe=$(basename "$(readlink /proc/$pid/exe | sed -e 's/ (deleted)$//')")

    [ "X$exe" = "X$progname" ] || return 1
    return 0
}
start() {
    # Start the datanode daemon on this host.
    #
    # Exits with status 1 (message on stdout) when the daemon is already
    # running according to status(), or when this host's name is not
    # listed in $thisdir/datanode.hosts. Otherwise creates $sockdir and
    # launches $prefix/bin/$progname with the pid file and config file,
    # appending its stderr to $prefix/log/${logname}_stderr.log.
    if status; then
        echo "$(hostname): $descrname already running"; exit 1
    fi

    this="$(hostname)"
    # Hosts file format: one host name per line, optionally followed by a
    # "#" comment; blank lines and comment-only lines are ignored.
    # Whitespace is matched with [ \t] and names with an explicit class:
    # a bare [:blank:] outside a bracket expression is just a set of
    # letters, and \w is not understood by every awk.
    ok="$(awk -v "this=$this" '
        /^[ \t]*$/ { next }
        /^[ \t]*#/ { next }
        /^[A-Za-z0-9_]+[ \t]*#/ { sub(/#.*/, ""); hosts[$1]=1; next }
        /^[A-Za-z0-9_]+[ \t]*$/ { hosts[$1]=1; next }
        { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
        END { for (host in hosts) { if (host == this) { print "found"; break } } }' \
        "$thisdir/datanode.hosts")"
    if [ -z "$ok" ]; then
        echo "$0: This node is not a datanode!"; exit 1
    fi

    mkdir -p "$sockdir"
    "$prefix/bin/$progname" \
        -pid "$sockdir/pid" \
        -conf "$conf" \
        2>>"$prefix/log/${logname}_stderr.log"
}
stop() {
    # Stop the datanode daemon on this host.
    #
    # Exits with status 1 if the daemon is not running. Otherwise asks it
    # to shut down via netplex-admin and polls every 0.1s until the
    # process group of the recorded pid is gone. After 50 polls a SIGTERM
    # is sent to the pid; after more than 600 polls the function gives up
    # and exits with status 1.
    if ! status; then
        # Fixed: this branch is reached when the daemon is NOT running.
        echo "$(hostname): $descrname not running"; exit 1
    fi

    pid="$(cat "$sockdir/pid")"
    "$prefix/bin/netplex-admin" -sockdir "$sockdir" -shutdown

    # Wait until the process group vanishes:
    n=0
    # A negative pid addresses the whole process group. No "--" is used
    # here because not every sh implementation of kill accepts it.
    while kill -0 "-$pid" 2>/dev/null; do
        sleep 0.1
        n=$((n+1))
        if [ $n -eq 50 ]; then
            echo "$(hostname),$descrname: Killing processes (hard shutdown)" >&2
            kill -15 "$pid"
        fi
        if [ $n -gt 600 ]; then break; fi
    done

    if [ $n -gt 600 ]; then
        echo "$(hostname): Timing out shutdown of $descrname" >&2
        exit 1
    fi
}
# Entry point: run the function named by the first argument. Any other
# argument (or none) prints the synopsis on stdout. The exit status of the
# script is that of the last command executed.
if [ "$1" = "start" ]; then
    start
elif [ "$1" = "stop" ]; then
    stop
elif [ "$1" = "status" ]; then
    status
else
    echo "$0 (start|stop|status)"
fi
#! /bin/sh
# rc script for NFS server start/stop
# Runs on the NFS node itself. The statements below are order dependent:
# global.conf must be sourced before $inst and $prefix are used.

# Directory containing this script; global.conf and the *.hosts files are
# looked up next to it.
thisdir="$(dirname "$0")"
# Instance settings. $inst and $prefix used below are expected to be
# defined by this file.
. "$thisdir/global.conf"
descrname="Plasma nfsnode"         # human-readable name used in messages
progname="nfs3d"                   # executable under $prefix/bin; also checked by status
logname="nfsd"                     # prefix of the stderr log file name
sockdir="/tmp/plasma_nfs_$inst"    # holds the pid file; passed to netplex-admin
conf="$prefix/etc/nfsnode.conf"    # configuration file handed to the daemon
status() {
    # Succeeds (0) only when the pid stored in $sockdir/pid belongs to a
    # live process whose executable is named $progname; fails (1)
    # otherwise. The pid read from the file is left in the global $pid.
    pid=$(cat "$sockdir/pid" 2>/dev/null)

    # Intentionally unquoted: an empty $pid turns this into an
    # argument-less kill, which fails and therefore means "not running".
    if ! kill -0 $pid 2>/dev/null; then
        return 1
    fi

    # Resolve the binary behind the pid and strip a trailing " (deleted)"
    # marker before comparing program names.
    exe=$(readlink /proc/$pid/exe | sed -e 's/ (deleted)$//')
    exe=$(basename "$exe")
    if [ "X$exe" != "X$progname" ]; then
        return 1
    fi
    return 0
}
start() {
    # Start the NFS bridge daemon on this host.
    #
    # Exits with status 1 (message on stdout) when the daemon is already
    # running according to status(), or when this host's name is not
    # listed in $thisdir/nfsnode.hosts. Otherwise creates $sockdir and
    # launches $prefix/bin/$progname with the pid file and config file,
    # appending its stderr to $prefix/log/${logname}_stderr.log.
    if status; then
        echo "$(hostname): $descrname already running"; exit 1
    fi

    this="$(hostname)"
    # Hosts file format: one host name per line, optionally followed by a
    # "#" comment; blank lines and comment-only lines are ignored.
    # Whitespace is matched with [ \t] and names with an explicit class:
    # a bare [:blank:] outside a bracket expression is just a set of
    # letters, and \w is not understood by every awk.
    ok="$(awk -v "this=$this" '
        /^[ \t]*$/ { next }
        /^[ \t]*#/ { next }
        /^[A-Za-z0-9_]+[ \t]*#/ { sub(/#.*/, ""); hosts[$1]=1; next }
        /^[A-Za-z0-9_]+[ \t]*$/ { hosts[$1]=1; next }
        { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
        END { for (host in hosts) { if (host == this) { print "found"; break } } }' \
        "$thisdir/nfsnode.hosts")"
    if [ -z "$ok" ]; then
        echo "$0: This node is not an nfsnode!"; exit 1
    fi

    mkdir -p "$sockdir"
    "$prefix/bin/$progname" \
        -pid "$sockdir/pid" \
        -conf "$conf" \
        2>>"$prefix/log/${logname}_stderr.log"
}
stop() {
    # Stop the NFS bridge daemon on this host.
    #
    # Exits with status 1 if the daemon is not running. Otherwise asks it
    # to shut down via netplex-admin and polls every 0.1s until the
    # process group of the recorded pid is gone. After 50 polls a SIGTERM
    # is sent to the pid; after more than 600 polls the function gives up
    # and exits with status 1.
    if ! status; then
        # Fixed: this branch is reached when the daemon is NOT running.
        echo "$(hostname): $descrname not running"; exit 1
    fi

    pid="$(cat "$sockdir/pid")"
    "$prefix/bin/netplex-admin" -sockdir "$sockdir" -shutdown

    # Wait until the process group vanishes:
    n=0
    # A negative pid addresses the whole process group. No "--" is used
    # here because not every sh implementation of kill accepts it.
    while kill -0 "-$pid" 2>/dev/null; do
        sleep 0.1
        n=$((n+1))
        if [ $n -eq 50 ]; then
            echo "$(hostname),$descrname: Killing processes (hard shutdown)" >&2
            kill -15 "$pid"
        fi
        if [ $n -gt 600 ]; then break; fi
    done

    if [ $n -gt 600 ]; then
        echo "$(hostname): Timing out shutdown of $descrname" >&2
        exit 1
    fi
}
# Entry point: run the function named by the first argument. Any other
# argument (or none) prints the synopsis on stdout. The exit status of the
# script is that of the last command executed.
if [ "$1" = "start" ]; then
    start
elif [ "$1" = "stop" ]; then
    stop
elif [ "$1" = "status" ]; then
    status
else
    echo "$0 (start|stop|status)"
fi
#! /bin/sh
# rc script for namenode start/stop
thisdir="$(dirname "$0")"
. "$thisdir/global.conf"
......@@ -27,6 +29,20 @@ start() {
echo `hostname`": $descrname already running"; exit 1
fi
this="$(hostname)"
ok="$(awk -v "this=$this" '
/^[:blank:]*$/ { next }
/^[:blank:]*#/ { next }
/^\w+[:blank:]*#/ { hosts[$1]=1; next }
/^\w+[:blank:]*$/ { hosts[$1]=1; next }
// { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
END { for (host in hosts) { if (host == this) { print "found"; break } } }' \
$thisdir/namenode.hosts)"
if [ -z "$ok" ]; then
echo "$0: This node is not a namenode!"; exit 1
fi
mkdir -p "$sockdir"
$prefix/bin/$progname \
-pid "$sockdir/pid" \
......
......@@ -7,6 +7,8 @@ usage () {
exit 2
}
set -e
template="template"
inst=""
prefix=""
......@@ -101,7 +103,7 @@ for f in namenode.hosts datanode.hosts tasknode.hosts nfsnode.hosts global.conf
cp -p "instances/$template/$f" "instances/$inst/$f"
done
for f in namenode.conf datanode.conf; do
for f in namenode.conf datanode.conf nfsnode.conf; do
sed -e 's/<INST>/'$inst'/g' \
-e 's|<PREFIX>|'$prefix'|g' \
-e 's/<BLOCKSIZE>/'$blocksize'/g' \
......@@ -111,5 +113,7 @@ done
echo "inst=\"$inst\"" >> instances/$inst/global.conf
echo "prefix=\"$prefix\"" >> instances/$inst/global.conf
echo "blocksize=$blocksize" >> instances/$inst/global.conf
echo "nn_port=2730" >> instances/$inst/global.conf
echo "dn_port=2728" >> instances/$inst/global.conf
echo "Done"
#! /bin/sh
# Start/stop all configured services in the right order
usage () {
    # Write the command synopsis to stderr and terminate with status 2.
    printf '%s\n' "usage: $0 (start|stop|status) inst_name" >&2
    exit 2
}
# Command line: <operation> <instance>. Options (anything starting with
# "-") are not supported and lead to the usage message.
op=""
inst=""
while [ $# -gt 0 ]; do
    case "$1" in
        -*)
            usage ;;
        *)
            # First positional argument is the operation, second the
            # instance name; a third one is an error.
            if [ -z "$op" ]; then
                op="$1"
            elif [ -z "$inst" ]; then
                inst="$1"
            else
                usage
            fi
            shift ;;
    esac
done

# All diagnostics go to stderr, like the other error messages below.
if [ -z "$op" ]; then
    echo "Error: missing operation" >&2
    usage
fi

case "$op" in
    start|stop|status) true ;;
    *)
        echo "Error: bad operation: $op" >&2
        exit 2 ;;
esac

if [ -z "$inst" ]; then
    echo "Error: missing instance" >&2
    usage
fi

# Instances are looked up relative to the current directory.
if [ ! -d "instances/$inst" ]; then
    echo "Error: no such instance: $inst" >&2
    exit 2
fi

# Services are started (and queried) in the order datanodes, namenodes,
# NFS nodes, and stopped in the reverse order.
case "$op" in
    start|status)
        echo "Datanodes:"; ./rc_dn.sh "$op" "$inst"
        echo "Namenodes:"; ./rc_nn.sh "$op" "$inst"
        echo "NFS nodes:"; ./rc_nfsd.sh "$op" "$inst" ;;
    stop)
        echo "NFS nodes:"; ./rc_nfsd.sh "$op" "$inst"
        echo "Namenodes:"; ./rc_nn.sh "$op" "$inst"
        echo "Datanodes:"; ./rc_dn.sh "$op" "$inst" ;;
esac
#! /bin/sh
# Start/stop datanodes
usage () {
    # Emit the synopsis on stderr, then abort the script with status 2.
    synopsis="usage: $0 (start|stop|status) inst_name"
    echo "$synopsis" >&2
    exit 2
}
# Command line: <operation> <instance>. Options (anything starting with
# "-") are not supported and lead to the usage message.
op=""
inst=""
while [ $# -gt 0 ]; do
    case "$1" in
        -*)
            usage ;;
        *)
            # First positional argument is the operation, second the
            # instance name; a third one is an error.
            if [ -z "$op" ]; then
                op="$1"
            elif [ -z "$inst" ]; then
                inst="$1"
            else
                usage
            fi
            shift ;;
    esac
done

# All diagnostics go to stderr, like the other error messages below.
if [ -z "$op" ]; then
    echo "Error: missing operation" >&2
    usage
fi

case "$op" in
    start|stop|status) true ;;
    *)
        echo "Error: bad operation: $op" >&2
        exit 2 ;;
esac

if [ -z "$inst" ]; then
    echo "Error: missing instance" >&2
    usage
fi

# Instances are looked up relative to the current directory.
if [ ! -d "instances/$inst" ]; then
    echo "Error: no such instance: $inst" >&2
    exit 2
fi
# Collect the datanode hosts of this instance from its datanode.hosts
# file. Format: one host name per line, optionally followed by a "#"
# comment; blank and comment-only lines are skipped, anything else is
# reported on stderr as a bad line. Hosts are kept as awk array keys, so
# duplicates collapse.
# Whitespace is matched with [ \t] and names with an explicit class: a
# bare [:blank:] outside a bracket expression is just a set of letters,
# and \w is not understood by every awk.
hosts="$(awk '
    /^[ \t]*$/ { next }
    /^[ \t]*#/ { next }
    /^[A-Za-z0-9_]+[ \t]*#/ { sub(/#.*/, ""); hosts[$1]=1; next }
    /^[A-Za-z0-9_]+[ \t]*$/ { hosts[$1]=1; next }
    { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
    END { for (host in hosts) { print host } }' \
    "instances/$inst/datanode.hosts")"
# $? is the exit status of awk (e.g. non-zero when the file is unreadable).
if [ $? -ne 0 ]; then
    echo "Stopping after error" >&2
    exit 2
fi
# Load the instance settings; $prefix is needed for the remote script path.
. "instances/$inst/global.conf"

# Run the per-host rc script on every datanode and report whether it
# succeeded. $hosts is intentionally unquoted so it splits into one word
# per host.
for host in $hosts; do
    # The host name is passed as data, never as the printf format string.
    printf '%s: ' "$host"
    if ssh "$host" "$prefix/etc/rc_dn.sh $op"; then
        echo "ok"
    else
        echo "error"
    fi
done
#! /bin/sh
# Start/stop nfsnodes
usage () {
    # Show how to call this script (on stderr) and exit with status 2.
    {
        echo "usage: $0 (start|stop|status) inst_name"
    } >&2
    exit 2
}
# Command line: <operation> <instance>. Options (anything starting with
# "-") are not supported and lead to the usage message.
op=""
inst=""
while [ $# -gt 0 ]; do
    case "$1" in
        -*)
            usage ;;
        *)
            # First positional argument is the operation, second the
            # instance name; a third one is an error.
            if [ -z "$op" ]; then
                op="$1"
            elif [ -z "$inst" ]; then
                inst="$1"
            else
                usage
            fi
            shift ;;
    esac
done

# All diagnostics go to stderr, like the other error messages below.
if [ -z "$op" ]; then
    echo "Error: missing operation" >&2
    usage
fi

case "$op" in
    start|stop|status) true ;;
    *)
        echo "Error: bad operation: $op" >&2
        exit 2 ;;
esac

if [ -z "$inst" ]; then
    echo "Error: missing instance" >&2
    usage
fi

# Instances are looked up relative to the current directory.
if [ ! -d "instances/$inst" ]; then
    echo "Error: no such instance: $inst" >&2
    exit 2
fi
# Collect the NFS node hosts of this instance from its nfsnode.hosts
# file. Format: one host name per line, optionally followed by a "#"
# comment; blank and comment-only lines are skipped, anything else is
# reported on stderr as a bad line. Hosts are kept as awk array keys, so
# duplicates collapse.
# Whitespace is matched with [ \t] and names with an explicit class: a
# bare [:blank:] outside a bracket expression is just a set of letters,
# and \w is not understood by every awk.
hosts="$(awk '
    /^[ \t]*$/ { next }
    /^[ \t]*#/ { next }
    /^[A-Za-z0-9_]+[ \t]*#/ { sub(/#.*/, ""); hosts[$1]=1; next }
    /^[A-Za-z0-9_]+[ \t]*$/ { hosts[$1]=1; next }
    { print "bad line in " FILENAME ": " $0 >"/dev/stderr"; next }
    END { for (host in hosts) { print host } }' \
    "instances/$inst/nfsnode.hosts")"
# $? is the exit status of awk (e.g. non-zero when the file is unreadable).
if [ $? -ne 0 ]; then
    echo "Stopping after error" >&2
    exit 2
fi
# Load the instance settings; $prefix is needed for the remote script path.
. "instances/$inst/global.conf"

# Run the per-host rc script on every NFS node and report whether it
# succeeded. $hosts is intentionally unquoted so it splits into one word
# per host.
for host in $hosts; do
    # The host name is passed as data, never as the printf format string.
    printf '%s: ' "$host"
    if ssh "$host" "$prefix/etc/rc_nfsd.sh $op"; then
        echo "ok"
    else
        echo "error"
    fi
done
......@@ -239,7 +239,7 @@ struct announcement {
longstring ann_revision;
/* the revision number of the sender */
int ann_rank;
longstring ann_rank;
/* configured rank */
};
......
......@@ -133,6 +133,7 @@ if $(not $(defined OCAMLRPCGEN))
pfs_namenode \
pfs_admin \
plasmaclient \
pfs_daemon \
pfs_nfs3 \
mr_framework
......
......@@ -15,6 +15,30 @@ let usage() =
prerr_endline " - fsck"
(** [find_coord namenode clustername] asks the namenode at [namenode]
    which node is the coordinator of the cluster [clustername].

    [namenode] is a string handed to [Plasma_util.parse_host_port]. The
    function opens a Coordination V1 RPC client over TCP to that address
    and calls [find_coordinator].

    Returns the value carried by [Some] after shutting the client down.

    This function never raises to its caller: if the RPC reports [None],
    or if any exception occurs inside the [try], the error is printed to
    stderr prefixed with ERROR: and the process exits with status 2.
    NOTE(review): on that error path the client is not shut down
    explicitly; the process terminates right away. *)
let find_coord namenode clustername =
let conn =
Plasma_util.connector_of_sockaddr
(Plasma_util.sockaddr_of_host_port
(Plasma_util.parse_host_port namenode)) in
let client =
Pfs_rpcapi_clnt.Coordination.V1.create_client2
(`Socket(Rpc.Tcp, conn, Rpc_client.default_socket_config)) in
( try
let c_opt =
Pfs_rpcapi_clnt.Coordination.V1.find_coordinator client clustername in
match c_opt with
| None ->
(* Raised inside the try, so it is reported by the handler below. *)
failwith ("Cannot find coordinator for cluster " ^ clustername)
| Some hp ->
Rpc_client.shut_down client;
hp
with
| error ->
(* Catch-all: any failure here is fatal for the command-line tool. *)
prerr_endline ("ERROR: " ^ Netexn.to_string error);
exit 2
)