Commit 5db2ce8b authored by gerd's avatar gerd

docs, build


git-svn-id: https://gps.dynxs.de/private/svn/app-plasma/trunk@235 55289a75-7b90-4627-9e07-ffb4263930b2
parent e76fe1d8
......@@ -23,8 +23,8 @@ Documentation:
- README
- GPL
- Release documentation, especially what is not yet working
- .x files
- Explanations about the transaction model
- .x files - DONE
- Protocol introduction; Explanations about the transaction model
- Use of shared memory
- Mapred instructions
- Mapred primer
......@@ -229,6 +229,13 @@ Mapred:
------------
problem ticket system: we do not distinguish between read and write
permission.
lookup: flag "repeatable"
Check: Switch to "repeatable read"?
problem namenode: should behave better when postgres connection limit
is exceeded, e.g. wait until connections become available again - DONE
......
......@@ -27,17 +27,25 @@ in user space, and can be accessed via RPC calls, or via NFS.
Client applications can only link with {b plasmasupport} and
{b plasmaclient}.
{3 [plasmasupport]: Support library}
{3 [plasmaclient]: RPC client}
{!modules:
Plasma_rng
Plasma_client
Plasma_shm
}
{3 [plasmaclient]: RPC client}
These are the mappings of the XDR definition to Ocaml (as used in
the client):
{!modules:
Plasma_shm
Plasma_client
Plasma_rpcapi_aux
Plasma_rpcapi_clnt
}
{3 [plasmasupport]: Support library}
{!modules:
Plasma_rng
}
{2 PlasmaFS RPC protocol definition}
......@@ -64,6 +72,12 @@ The following interfaces exist only within the server.
{!modules:
Pfs_db
}
These are the mappings of the XDR definition to Ocaml (as used in
the server):
{!modules:
Pfs_rpcapi_aux
Pfs_rpcapi_clnt
Pfs_rpcapi_srv
......
This diff is collapsed.
/* $Id$ -*- c -*- */
/** {1:datanode [Datanode]} */
/** Datanode access.
*/
#include "pfs_types.x"
#ifndef PFS_DATANODE_X
......@@ -7,63 +12,105 @@
program Datanode {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [identity] } */
longstring identity(longstring) = 1;
/* Returns the identity of this node (an ID which is assigned anew
/** Returns the identity of this node (an ID which is assigned anew
when the datanode is initialized). The arg is the clustername.
If the node belongs to the wrong cluster, this RPC must return
SYSTEM_ERR.
[SYSTEM_ERR].
*/
/** {2 [size] } */
hyper size(void) = 2;
/* Returns the number of blocks. The blocks have numbers from 0
to size-1
/** Returns the number of blocks. The blocks have numbers from 0
to [size-1]
*/
/** {2 [blocksize] } */
int blocksize(void) = 3;
/* Returns the blocksize */
/** Returns the blocksize */
/** {2 [clustername] } */
longstring clustername(void) = 4;
/* Returns the clustername */
/** Returns the clustername */
/** {2:read [read] } */
dn_channel_rd_data read
(dn_channel_rd_req, hyper, int, int, hyper, hyper) = 5;
/* Reads a block, or a part of it:
read(req, block, pos, len, st_id, st_vfy)
/** [read(req, block, pos, len, st_id, st_vfy)]:
Reads a block, or a part of it. [req] defines how the data
is passed back to the caller (see docs for [dn_channel_rd_req]
in {!Pfs_types}). The [block] is the block number of this
datanode. [pos] and [len] select a substring of this block.
[st_id] and [st_vfy] are the safetrans ticket returned by
[get_blocks].
Right now this ticket is not checked.
*/
/** {2:write [write] } */
void write(hyper, dn_channel_wr_data, hyper, hyper) = 6;
/* Writes a block. It is only possible to write a block completely.
Call: write(block, contents, st_id, st_vfy): Writes [contents]
to [block]. [contents] must have the length [blocksize].
[st_id] is the safetrans ID, and [st_vfy] is the verifier
as returned by the namenode.
/** [write(block, contents, st_id, st_vfy)]:
Writes a block. It is only possible to write a block completely.
The [block] is the block number of this datanode.
In [contents] the data to write is passed. (See the docs
for [dn_channel_wr_data] in {!Pfs_types} for details.)
The data in [contents] must have the length [blocksize].
[st_id] is the safetrans ID, and [st_vfy] is the verifier
as returned by the namenode.
The safetrans ticket {i is} checked!
*/
/** {2:copy [copy] } */
void copy(hyper, longstring, longstring, hyper,
hyper, hyper, hyper, hyper) = 7;
/* Copies a block, possibly to a remote system:
copy(block, dest_node, dest_identity, dest_block, st_id, st_vfy,
dest_st_id, dest_st_vfy).
If dest_identity is equal to the own identity, this is a local
copy. Otherwise, dest_node is interpreted as "host:port", and the
/** [copy(block, dest_node, dest_identity, dest_block, st_id, st_vfy,
dest_st_id, dest_st_vfy)]:
Copies a block, possibly to a remote system. [block] identifies
the block on this datanode. [dest_node] is the datanode server
to which the block is written. [dest_identity] is the
identity of the destination server. [dest_block] is the
block number of the destination server.
If [dest_identity] is equal to the own identity, this is a local
copy. Otherwise, [dest_node] is interpreted as "host:port", and the
block is written to the remote host.
st_id, st_vfy: as in [read]
dest_st_id, dest_st_vfy: as in [write]
[st_id], [st_vfy]: as in [read]
[dest_st_id], [dest_st_vfy]: as in [write]
*/
/** {2:zero [zero] } */
void zero(hyper, hyper, hyper) = 8;
/* Fills a block with zeros: block, st_id, st_vfy */
/** [zero(block, st_id, st_vfy)]:
Fills a block with zeros
*/
/** {2:sync [sync] } */
void sync(void) = 9;
/* Waits until the next sync is done */
/** Waits until the next sync cycle is done */
/** {2:alloc_shm_if_local [alloc_shm_if_local] } */
longstring_opt alloc_shm_if_local(void) = 10;
/* If the client is on the same node, this RPC allocates a new
/** If the client is on the same node, this RPC allocates a new
POSIX shm object, and returns the path of this object.
The object has zero size, and is created with mode 666.
If the client is not on the same node, the RPC returns NULL.
......@@ -75,8 +122,10 @@ program Datanode {
insecure.
*/
/** {2:udsocket_if_local [udsocket_if_local] } */
longstring_opt udsocket_if_local(void) = 11;
/* If the client is on the same node, this RPC may return the
/** If the client is on the same node, this RPC may return the
name of a Unix Domain socket to contact instead.
*/
......
/* $Id$ */
/* $Id$ -*- c -*- */
/** Internal stuff.
*/
#ifndef PFS_DN_INTERNAL_X
#define PFS_DN_INTERNAL_X
#include "pfs_types.x"
/* The Datanode_ctrl program is only invoked by the namenode */
/** {1:datanode_ctrl [Datanode_ctrl]} */
/** The [Datanode_ctrl] program is running on each datanode, but
only invoked by the coordinator to push and revoke safetrans tickets.
*/
program Datanode_ctrl {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [reset_all_safetrans] } */
void reset_all_safetrans(void) = 1;
/* Reset all safetrans */
/** Revokes all safetrans tickets. This is called when the coordinator
starts up.
*/
/** {2 [cancel_safetrans] } */
void cancel_safetrans(hyper) = 2;
/* Cancel the safetrans of this st_id */
/** Cancel the safetrans ticket with this [st_id] */
/** {2 [safetrans] } */
void safetrans(hyper, hyper, hyper) = 3;
/* safetrans(st_id, st_tmo, st_secret) */
/** [safetrans(st_id, st_tmo, st_secret)]: Enables all safetrans
tickets with ID [st_id]. The secret [st_secret] is used
for securing the ticket system.
*/
} = 1;
} = 0x8000d002;
/* Datanode_io is internally used by the datanode: */
/** {1:datanode_io [Datanode_io]} */
/** The [Datanode_io] program is running in the I/O processes of the
datanodes
*/
program Datanode_io {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [read] } */
void read(int, hyper, int, int) = 1;
/* read(slot, block, pos, len): Reads the block (or part of it)
/** [read(slot, block, pos, len)]: Reads the block (or part of it)
to the shm [slot]
*/
/** {2 [read_shm] } */
void read_shm(dn_channel_shm_obj, hyper, int, int) = 7;
/* read_shm(shm, block, pos, len): Reads the block (or part of it)
/** [read_shm(shm, block, pos, len)]: Reads the block (or part of it)
to the passed shm object. It is an error if the object is not
large enough.
*/
/** {2 [write] } */
void write(int, hyper) = 2;
/* write(slot, block): Writes the contents of [slot] to [block] */
/** [write(slot, block)]: Writes the contents of [slot] to [block] */
/** {2 [write_shm] } */
void write_shm(dn_channel_shm_obj, hyper) = 8;
/* write_shm(shm, block): Writes the contents of [shm] to [block].
/** [write_shm(shm, block)]: Writes the contents of [shm] to [block].
Exactly blocksize bytes are written.
*/
/** {2 [copy] } */
void copy(hyper, longstring, longstring, hyper,
hyper, hyper, int) = 3;
/* Copies a block, possibly to a remote system:
copy(block, dest_node, dest_identity, dest_block,
dest_st_id, dest_st_vfy, slot)
Also see above.
/** [copy(block, dest_node, dest_identity, dest_block,
dest_st_id, dest_st_vfy, slot)]
*/
/** {2 [sync] } */
void sync(void) = 4;
/* syncs everything to disk */
/** syncs everything to disk */
/** {2 [size] } */
hyper size(void) = 5;
/* same as in [Datanode] */
/** same as in [Datanode] */
/** {2 [identity] } */
longstring identity(void) = 6;
/* same as in [Datanode] */
/** same as in [Datanode] */
} = 1;
} = 0x8000d003;
......
/* $Id$ -*- c -*- */
/** {1:coordination [Coordination]} */
/** Find the coordinator.
*/
/** One of the name nodes is elected to be the coordinator at cluster
startup time. Right now, all namenode requests have to go to the
coordinator. (In the future, the other namenodes may be allowed to
respond to certain read requests.)
*/
#include "pfs_types.x"
#ifndef PFS_NN_COORD
......@@ -7,38 +18,53 @@
program Coordination {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2:find_coordinator [find_coordinator] } */
longstring_opt find_coordinator(longstring) = 1;
/* Returns the current coordinator as host:port string. The Filesystem
/** Returns the current coordinator as host:port string. The Filesystem
requests must be sent to the coordinator.
The arg is the clustername. If this node is not part of this
cluster the RPC will return NULL.
*/
/** {2 [find_namenodes] } */
longstrings find_namenodes(longstring) = 2;
/* Return all name nodes, including the coordinator and the slaves.
/** Return all name nodes, including the coordinator and the slaves.
The arg is the clustername. If this node is not part of this
cluster the RPC will return an empty array.
*/
/** {2:is_coordinator [is_coordinator] } */
bool is_coordinator(longstring) = 3;
/* Whether this node is the coordinator
/** Whether this node is the coordinator.
The arg is the clustername. If this node is not part of this
cluster the RPC will return false.
*/
/** {2 [clustername] } */
longstring clustername(void) = 4;
/* Returns the clustername */
/** Returns the clustername */
/** {2:find_inodecaches [find_inodecaches] } */
longstrings find_inodecaches(longstring) = 5;
/* Return all inodecaches for this clustername */
/** Return all inodecaches for this clustername */
/** {2 [local_identities] } */
longstrings local_identities(longstring) = 6;
/* Returns the identities of the data nodes running on the machine
/** Returns the identities of the data nodes running on the machine
of the caller. Only available data nodes are returned.
The arg is the clustername.
*/
......
/* $Id$ -*- c -*- */
/** {1:dn_admin [Dn_admin]} */
/** Administration of datanodes */
#include "pfs_types.x"
#ifndef PFS_NN_DNADMIN
......@@ -7,39 +11,48 @@
program Dn_admin {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [add_datanode] } */
void add_datanode(longstring, hyper) = 1;
/* add_datanode(identity, size):
/** [add_datanode(identity, size)]:
adds a new datanode. It is in disabled state initially
*/
void enable_datanode(longstring, longstring) = 2;
/* enable_datanode(identity, hostport): Enables the data node for
identity on hostport.
/** {2 [enable_datanode] } */
// FIXME: "At enable time, the datanode is checked whether it was enlarged." - probably another RPC
void enable_datanode(longstring, longstring) = 2;
/** [enable_datanode(identity, hostport)]: Enables the data node for
[identity] on [hostport].
*/
/** {2 [disable_datanode] } */
void disable_datanode(longstring) = 3;
/* disable_datanode(identity):
/** [disable_datanode(identity)]:
This means blocks
are not allocated on the node, and if [all=false] blocks from
this node are suppressed by get_blocks. This datanode state is
considered temporary (e.g. while moving a disk from one node to
another).
This means blocks are no longer allocated on the node, and
blocks from this node are reported as non-alive by
[get_blocks]. This datanode state is considered temporary
(e.g. while moving a disk from one node to another).
The coordinator pings the datanodes anyway, and quickly finds
out whether nodes are down. This RPC is more useful for
administratively taking nodes down, e.g. for maintenance.
*/
/** {2 [is_enabled] } */
bool is_enabled(longstring) = 4;
/* whether this identity is enabled */
/** whether this identity is enabled */
/** {2 [lookup] } */
longstring lookup(longstring) = 5;
/* looks the identity up and returns hostport. Only for enabled
/** looks the identity up and returns hostport. Only for enabled
nodes that could be at least contacted once. The node may be
unresponsive now, though.
......@@ -49,15 +62,18 @@ program Dn_admin {
Fails if the identity is unknown.
*/
/** {2 [datanodes] } */
longstrings datanodes(void) = 6;
/* Returns the identities of all datanodes */
/** Returns the identities of all datanodes */
/** {2 [destroy_datanode] } */
void destroy_datanode(longstring) = 7;
/* destroy_datanode(identity):
/** [destroy_datanode(identity)]:
The datanode is considered permanently as non-available.
For all files storing blocks on this node the replication
number is reduced by 1.
This removes all information about this datanode from the
namenode database.
*/
} = 1;
......
This diff is collapsed.
/* $Id$ -*- c -*- */
/** {1:inodecache [Inodecache]} */
/** Quickly determine inode modifications.
*/
/** The inodecache is a helper service running on the namenodes.
Only the inodecache of the coordinator must be used.
One can get a list of available inodecache ports from the
function [find_inodecaches] in {!Pfs_nn_coord}.
The inodecache can quickly determine whether an [inodeinfo]
is still up to date, or whether the sequence number of the
[inodeinfo] is still up to date. This is faster than a regular
[get_inodeinfo] in {!Pfs_nn_fsys} because this can happen
outside a transaction, and because often no database query is
required.
The inodecache keeps the information about an inode only for
a short time. During that period, it arranges with the coordinator
that the cache is actively notified when the inode is modified
(or more exactly, when a modification is committed). Note that
there is still a small delay between this notification and the
real check, so whatever the inodecache reports, this may already
be outdated. Nevertheless, this information is meaningful when
used in the right way:
Assume you want to read the blocks of a file. You have an old
[inodeinfo] struct at hand, and an old blocklist. So how to read
blocks while ensuring they are recent? The way to do this is to {i
first} trust your old information and to read the block, and {i
then} to call the inodecache to check whether your information was
correct. If not, you have to update your information with the more
expensive [get_inodeinfo] and [get_blocks] calls, and to start
over. However, if the inodecache says the information was correct,
you know you did the right thing. The point here is that the
inodecache can only validate actions that already happened, but it
cannot give guarantees for the future.
*/
#include "pfs_types.x"
#ifndef PFS_NN_INODECACHE
#define PFS_NN_INODECACHE
program Inode_cache {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [is_up_to_date] } */
bool is_up_to_date(hyper, inodeinfo) = 1;
/* is_up_to_date(inode,ii): Checks whether ii is the current
/** [is_up_to_date(inode,ii)]: Checks whether [ii] is the current
version of the inode metadata for [inode]. Returns [true]
if so.
if this was the case at the moment the RPC was sent by
the caller.
Returns [false] if the inode is not known, if an error
occurs, or if it cannot be quickly determined that the inode
is actually up to date.
is actually up to date. So [false] does not necessarily imply
that [ii] is out of date. In this case, the client should
use alternate means of checking this.
*/
/** {2:is_up_to_date_seqno [is_up_to_date_seqno] } */
bool is_up_to_date_seqno(hyper, hyper) = 2;
/* is_up_to_date(inode,seqno): Same check but only for the
seqno
/** [is_up_to_date_seqno(inode,seqno)]: Same check but only for the
sequence number of the inode
*/
} = 1;
} = 0x8000e004;
......
/* $Id$ -*- c -*- */
/* Internal interfaces uses by the namenodes */
/** Internal interfaces used by the namenodes
*/
#include "pfs_types.x"
#ifndef PFS_NN_INTERNAL_X
#define PFS_NN_INTERNAL_X
/** {1:elect [Elect]} */
/** The election happens at cluster startup. The goal is to determine
the coordinator. Participants are all namenodes.
*/
program Elect {
version V1 {
/** {2 [null] } */
void null(void) = 0;
/** {2 [announce] } */
bool announce(announcement) = 1;
/* At cluster start the namenodes start calling [announce] of all
other namenodes - until they got a reply from each, or until
the end of the startup period is reached
/** At cluster start the namenodes start calling the
[announce] RPC of all other namenodes - until they get a
reply from each, or until the end of the startup period is
reached.
If received within the startup period, the response is [true]
if the announcement is better than the server to which it is sent.
If received after startup, the response is [false], and the
sender must not start up.
As all namenodes call [announce] of all other namenodes, the
question is whether there is a winner. If we assume there
is a total ordering between the [announcement]s, there is
a best announcement if no two namenodes emit equal announcements.
So given the announcements are all distinct, there is a winner.
*/
/** {2 [set_coordinator] } */
void set_coordinator(longstring, longstring, longstring) = 2;
/* When the end of the startup period is reached, one of the name
/** When the end of the startup period is reached, one of the name
nodes sends [set_coordinator] to all other nodes, and becomes the
coordinator. The coordinator must be eligible by all other nodes
that actually respond. Also, the coordinator must have a highest
......@@ -37,7 +58,7 @@ program Elect {
The third arg is the revision identifier.
*/
/* There is right now no provision for the case that the coordinator
/** There is right now no provision for the case that the coordinator
crashes - no other node is then automatically elected. Best is
to restart everything then.
*/
......@@ -45,40 +66,75 @@ program Elect {
} = 1;
} = 0x8000f001;
/** {1:nameslave [Nameslave]} */
/** This RPC program is activated on the non-coordinator namenodes. It
is called by the coordinator to push updates of the database.
*/
program Nameslave {
version V1 {
/* This is what the non-coordinators implement */
/** {2 [null] } */
void null(void) = 0;
/** {2 [begin_transaction] } */
void begin_transaction(longstring, longstring) = 1;
/* Begin a transaction: clustername, expected_rev.
/** Begin a transaction: clustername, expected_rev.
The 2nd arg is the expected revision string
*/
/** {2 [prepare_commit] } */
bool prepare_commit(void) = 2;
/* Result is true if the name database could be updated.
/** Result is true if the name database could be updated.
*/
/** {2 [commit] } */
void commit(void) = 3;
/* The response of [commit] is the ACK in the extended 2-phase
/** The response of [commit] is the ACK in the extended 2-phase
commit protocol
*/
/* void abort(void) = 4; */
/** Note that the names of the following RPCs correspond to
function names in {!Nn_db}:
*/
/** {2 [push_inode_ins] } */
void push_inode_ins(hyper, inodeinfo) = 7;
/** [push_inode_ins(inode, ii)] */
/** {2 [push_inode_upd] } */
void push_inode_upd(hyper, inodeinfo) = 8;
/** [push_inode_upd(inode, ii)] */
/** {2 [push_inode_upd_time] } */
void push_inode_upd_time(hyper, time_opt, time_opt) = 18;
/** [push_inode_upd_time(inode, mtime, ctime)] */
/** {2 [push_inode_del] } */
void push_inode_del(hyper) = 9;
/** [push_inode_del(inode)] */
/** {2 [push_blockalloc_upd] } */
void push_blockalloc_upd(int, hyper, longstring) = 10;
/* push_blockalloc_upd(datastore,blkidx,blkmap) */
/** [push_blockalloc_upd(datastore,blkidx,blkmap)] */
/** {2 [push_datastore_upd] } */
void push_datastore_upd(int, longstring, hyper, bool) = 11;
/* push_upd_datastore(id,identity,size,enabled): Updates the
/** [push_datastore_upd(id,identity,size,enabled)]: Updates the
datastore table. If the record is new, it is added.
The blockalloc table is updated, too: For new stores, the
......@@ -88,31 +144,52 @@ program Nameslave {
It is an error to decrease the size.
*/
/** {2 [push_datastore_del] } */
void push_datastore_del(int) = 12;
/* Deletes the datastore with this ID and all rows referencing it */
/** Deletes the datastore with this ID and all rows referencing it */
/** {2 [push_revision_upd] } */
void push_revision_upd(longstring) = 13;
/* Sets the revision id in the db */
/** Sets the revision id in the db */
/** {2 [push_inodeblocks_ins] } */
void push_inodeblocks_ins(hyper, blocklist) = 14;
/** [push_inodeblocks_ins(inode, bl)] */
/** {2 [push_inodeblocks_del] } */
void push_inodeblocks_del(hyper, hyper, hyper) = 15;
/* inode, blkidx, len */
/** [push_inodeblocks_del(inode, blkidx, len)] */