CASSANDRA SUMMIT SF 2014 
CONTRIBUTOR BOOT 
CAMP 
Aaron Morton 
@aaronmorton 
Co-Founder & Principal Consultant 
Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License
Architecture 
Startup, Shutdown & Failure 
StorageProxy 
MessagingService 
Gossip
Dynamo Cluster Architecture 
Clients 
APIs 
Dynamo 
Database 
Disk 
APIs 
Dynamo 
Database 
Disk 
Node 1 Node 2
API Layer 
o.a.c.auth 
o.a.c.cql3 
o.a.c.metrics 
o.a.c.thrift 
o.a.c.transport
API Layer 
Talks to Dynamo layer using 
Commands via the 
StorageProxy
Dynamo Layer 
o.a.c.dht 
o.a.c.gms 
o.a.c.locator 
o.a.c.net 
o.a.c.repair 
o.a.c.service 
o.a.c.streaming
Dynamo Layer 
Talks to Database layer by 
sending messages to 
IVerbHandlers via the 
MessagingService.
Database Layer 
o.a.c.cache 
o.a.c.concurrent 
o.a.c.db 
o.a.c.io 
o.a.c.serializers
Global Services 
o.a.c.config 
o.a.c.trace 
o.a.c.utils
Architecture 
Startup, Shutdown & Failure 
StorageProxy 
MessagingService 
Gossip
o.a.c.service.CassandraDaemon.main() 
! 
// Singleton 
// Start MBean 
setup() // here be magic
o.a.c.service.CassandraDaemon.setup() 
// JNA 
Thread.setDefaultUncaughtExceptionHandler() 
// Check directories exist 
SystemKeyspace.checkHealth(); 
DatabaseDescriptor.loadSchemas(); 
CFS.disableAutoCompaction(); 
!
o.a.c.service.CassandraDaemon.setup() 
CommitLog.recover(); 
StorageService.registerDaemon(); 
StorageService.initServer();
Exception Hook 
! 
// Exception Metrics 
! 
FileUtils.handleFSError() 
FileUtils.handleCorruptSSTable()
Shutdown and Drain Hook 
! 
// Shutdown client transports 
// Shutdown thread pools 
// Blocking flush to disk 
// Shutdown commit log 
!
Architecture 
Startup, Shutdown & Failure 
StorageProxy 
MessagingService 
Gossip
o.a.c.service.StorageProxy 
! 
// Cluster wide storage operations 
// Select endpoints & check CL available 
// Send messages to Stages 
// Wait for response 
// Store Hints
o.a.c.service.IResponseResolver 
! 
preprocess(MessageIn<T> message) 
resolve() throws DigestMismatchException 
! 
RowDigestResolver 
RowDataResolver 
RangeSliceResponseResolver
Response Handlers / Callback 
! 
implements IAsyncCallback<T> 
! 
response(MessageIn<T> msg) 
!
o.a.c.service.ReadCallback.get() 
! 
// Wait for blockFor & data 
condition.await(timeout, 
TimeUnit.MILLISECONDS) 
! 
// if condition not set 
throw ReadTimeoutException() 
! 
resolver.resolve()
o.a.c.service.StorageProxy.fetchRows() 
! 
AbstractReadExecutor.getReadExecutor() 
exec.executeAsync(); 
exec.maybeTryAdditionalReplicas(); 
--------------------------------------- 
AbstractReadExecutor.get() //handler.get 
catch (DigestMismatchException ex) 
catch (ReadTimeoutException ex)
AbstractReadExecutor.getReadExecutor() 
! 
StorageProxy.getLiveSortedEndpoints() 
CFMetaData.newReadRepairDecision() 
ConsistencyLevel.filterForQuery() 
ConsistencyLevel.assureSufficientLiveNodes() 
…
AbstractReadExecutor.getReadExecutor() 
! 
// no retry or blocking for all replicas 
return new NeverSpeculatingReadExecutor() 
! 
// always retry or targeting all replicas 
return new AlwaysSpeculatingReadExecutor() 
! 
// otherwise 
return new SpeculatingReadExecutor()
AbstractReadExecutor() 
! 
resolver = new RowDigestResolver() 
handler = new ReadCallback<>()
AbstractReadExecutor.executeAsync() 
// makeDataRequests 
MessagingService.instance().sendRR(command.createMessage(), endpoint, 
handler); 
! 
// makeDigestRequests 
ReadCommand digestCommand = command.copy(); 
digestCommand.setDigestQuery(true); 
MessageOut<?> message = digestCommand.createMessage(); 
MessagingService.instance().sendRR(message, endpoint, 
handler);
StorageProxy.mutateAtomically() 
! 
wrapResponseHandler() 
AbstractWriteResponseHandler.assureSufficientLiveNodes() 
! 
----------------------------------------------------- 
getBatchlogEndpoints() 
syncWriteToBatchlog() // all mutations 
syncWriteBatchedMutations() // all wrappers 
asyncRemoveFromBatchlog() 
! 
catch (UnavailableException e) 
catch (WriteTimeoutException e)
StorageProxy.wrapResponseHandler() 
! 
StorageService.getNaturalEndpoints() 
TokenMetadata.pendingEndpointsFor() 
AbstractReplicationStrategy.getWriteResponseHandler() 
----------------------------------------- 
! 
// AbstractWriteResponseHandler 
WriteResponseHandler 
DatacenterWriteResponseHandler 
DatacenterSyncWriteResponseHandler 
ReplayWriteResponseHandler
StorageProxy.syncWriteBatchedMutations() 
! 
// write to natural and pending endpoints 
sendToHintedEndpoints() 
! 
--------------------------------------- 
! 
AbstractWriteResponseHandler.get()
StorageProxy.sendToHintedEndpoints() 
// loop all targets 
MessagingService.sendRR() // for local 
! 
// group messages for remote DCs 
dcGroups.get(dc).add(destination) 
! 
// write hints for down nodes 
submitHint() 
--------------------------------------- 
! 
sendMessagesToNonlocalDC()
Architecture 
Startup, Shutdown & Failure 
StorageProxy 
MessagingService 
Gossip
MessagingService Transport Layer 
Custom Serialisation over TCP 
Sockets. 
Serialisers spread around 
code.
o.a.c.net.MessagingService.Verb <<enum>> 
! 
MUTATION 
READ 
REQUEST_RESPONSE 
TREE_REQUEST 
TREE_RESPONSE 
(And more...)
o.a.c.net.MessagingService.verbHandlers 
! 
new EnumMap<Verb, 
IVerbHandler>(Verb.class)
o.a.c.net.IVerbHandler<T> 
! 
doVerb(MessageIn<T> message, String id); 
!
o.a.c.net.MessageIn<T> 
public class MessageIn<T> 
{ 
public final InetAddress from; 
public final T payload; 
public final Map<String, byte[]> parameters; 
public final MessagingService.Verb verb; 
public final int version; 
… 
}
o.a.c.net.MessageOut<T> 
public class MessageOut<T> 
{ 
public final InetAddress from; 
public final MessagingService.Verb verb; 
public final T payload; 
public final IVersionedSerializer<T> 
serializer; 
public final Map<String, byte[]> parameters; 
… 
}
o.a.c.net.MessagingService.verbStages 
! 
new EnumMap<MessagingService.Verb, 
Stage>(MessagingService.Verb.class)
o.a.c.net.MessagingService.verbStages 
! 
put(Verb.MUTATION, Stage.MUTATION); 
put(Verb.READ, Stage.READ); 
put(Verb.REQUEST_RESPONSE, 
Stage.REQUEST_RESPONSE);
o.a.c.net.MessagingService.receive() 
! 
runnable = new MessageDeliveryTask( 
message, id, timestamp); 
! 
stage = StageManager.getStage( 
message.getMessageType()); 
! 
stage.execute(runnable);
o.a.c.net.MessageDeliveryTask.run() 
! 
// If droppable and past rpc_timeout 
MessagingService.incrementDroppedMessages(verb); 
! 
MessagingService.getVerbHandler(verb) 
verbHandler.doVerb(message, id)
Architecture 
Startup, Shutdown & Failure 
StorageProxy 
MessagingService 
Gossip
o.a.c.gms.ApplicationState 
! 
STATUS, 
LOAD, 
SCHEMA, 
DC, 
RACK, 
RELEASE_VERSION, 
REMOVAL_COORDINATOR, 
INTERNAL_IP, 
RPC_ADDRESS, 
SEVERITY, 
NET_VERSION …
o.a.c.gms.VersionedValue 
! 
public final int version; 
public final String value;
o.a.c.gms.VersionGenerator 
{ 
private static final AtomicInteger version = new 
AtomicInteger(0); 
! 
public static int getNextVersion() 
{ 
return version.incrementAndGet(); 
} 
}
o.a.c.gms.EndpointState 
{ 
private volatile HeartBeatState hbState; 
final Map<ApplicationState, VersionedValue> applicationState = new 
NonBlockingHashMap<ApplicationState, VersionedValue>(); 
! 
}
o.a.c.gms.HeartBeatState 
{ 
private int generation; 
private int version; 
… 
}
o.a.c.db.SystemKeyspace.incrementAndGetGeneration() 
SELECT gossip_generation FROM system.local WHERE key='local'; 
! 
// if none 
generation = (int) (System.currentTimeMillis() / 1000); 
! 
// else 
generation = (int) (System.currentTimeMillis() / 1000); 
// and some other checks
nodetool gossipinfo 
generation:1410220170 
heartbeat:37 
LOAD:1.57821104E8 
STATUS:NORMAL,-1007384361686170050 
RACK:rack1 
NET_VERSION:8 
SEVERITY:0.0 
RELEASE_VERSION:2.1.0-rc5 
SCHEMA:f3b70c8e-a904-3de9-ac5d-8ab30271441d 
HOST_ID:4aac20b5-3c68-4a26-a415-2e2f2ff0ed46 
RPC_ADDRESS:127.0.0.1
o.a.c.gms.Gossiper.GossipTask.run() 
Gossip every second. 
1 to 3 nodes. 
! 
Three step process.
Processed by IVerbHandlers 
I send SYN. 
Remote replies with ACK. 
I send ACK2.
o.a.c.gms.GossipDigestSyn 
Exchange List<GossipDigest> 
! 
GossipDigest 
{ 
final InetAddress endpoint; 
final int generation; 
final int maxVersion; 
… 
}
o.a.c.gms.Gossiper.examineGossiper() 
// If empty SYN send all my info (shadow gossip) 
! 
if (remoteGeneration == localGeneration && maxRemoteVersion 
== maxLocalVersion) 
// do nothing 
! 
else if (remoteGeneration > localGeneration) 
// we request everything from the gossiper 
! 
else if (remoteGeneration < localGeneration) 
// send all data with generation = localgeneration and version > 0
o.a.c.gms.Gossiper.examineGossiper() 
else if (remoteGeneration == localGeneration) 
! 
/* 
If the max remote version is greater, then we request the remote 
endpoint send us all the data for this endpoint with version greater 
than the max version number we have locally for this endpoint. 
! 
If the max remote version is lesser, then we send all the data we 
have locally for this endpoint with version greater than the max 
remote version. 
*/
Thanks. 
!
Aaron Morton 
@aaronmorton 
! 
Co-Founder & Principal Consultant 
www.thelastpickle.com 
! 
Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License

Cassandra 2.1 boot camp, Overview