CASSANDRA SF 2013
CASSANDRA
INTERNALS
Aaron Morton
@aaronmorton
www.thelastpickle.com
#Cassandra13
Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License
About Me
Freelance Cassandra Consultant
Based in Wellington, New Zealand
Apache Cassandra Committer
#Cassandra13
Architecture
Code
#Cassandra13
Cassandra Architecture
APIs
Cluster Aware
Cluster Unaware
Clients
Disk
#Cassandra13
Cassandra Cluster Architecture
APIs
Cluster Aware
Cluster Unaware
Clients
Disk
APIs
Cluster Aware
Cluster Unaware
Disk
Node 1 Node 2
#Cassandra13
Dynamo Cluster Architecture
APIs
Dynamo
Database
Clients
Disk
APIs
Dynamo
Database
Disk
Node 1 Node 2
#Cassandra13
Architecture
API
Dynamo
Database
#Cassandra13
API Transports
Thrift
Native Binary
#Cassandra13
Thrift Transport
//Custom TServer implementations
o.a.c.thrift.CustomTThreadPoolServer
o.a.c.thrift.CustomTNonBlockingServer
o.a.c.thrift.CustomTHsHaServer
#Cassandra13
API Transports
Thrift
Native Binary
#Cassandra13
Native Binary Transport
Beta in Cassandra 1.2
Uses Netty
Enabled with
start_native_transport
(Disabled by default)
#Cassandra13
o.a.c.transport.Server.run()
//Setup the Netty server
new ExecutionHandler()
new NioServerSocketChannelFactory()
ServerBootstrap.setPipelineFactory()
#Cassandra13
o.a.c.transport.Message.Dispatcher.messageReceived()
//Process message from client
ServerConnection.validateNewMessage()
Request.execute()
ServerConnection.applyStateTransition()
Channel.write()
#Cassandra13
Messages
Defined in the Native Binary
Protocol
$SRC/doc/native_protocol.spec
#Cassandra13
API Services
JMX
Thrift
CQL 3
#Cassandra13
JMX Management Beans
Spread around the code base.
Interfaces named *MBean
#Cassandra13
JMX Management Beans
Registered with names such as
org.apache.cassandra.db:
type=StorageProxy
#Cassandra13
API Services
JMX
Thrift
CQL 3
#Cassandra13
o.a.c.thrift.CassandraServer
// Implements Thrift Interface
// Access control
// Input validation
// Mapping to/from Thrift and internal types
#Cassandra13
Thrift Interface
Thrift IDL
$SRC/interface/cassandra.thrift
#Cassandra13
o.a.c.thrift.CassandraServer.get_slice()
// get columns for one row
Tracing.begin()
ClientState cState = state()
cState.hasColumnFamilyAccess()
multigetSliceInternal()
#Cassandra13
CassandraServer.multigetSliceInternal()
// get columns for many rows
ThriftValidation.validate*()
// Create ReadCommands
getSlice()
#Cassandra13
CassandraServer.getSlice()
// Process ReadCommands
// return Thrift types
readColumnFamily()
thriftifyColumnFamily()
#Cassandra13
CassandraServer.readColumnFamily()
// Process ReadCommands
// Return ColumnFamilies
StorageProxy.read()
#Cassandra13
API Services
JMX
Thrift
CQL 3
#Cassandra13
o.a.c.cql3.QueryProcessor
// Prepares and executes CQL3 statements
// Used by Thrift & Native transports
// Access control
// Input validation
// Returns transport.ResultMessage
#Cassandra13
CQL3 Grammar
ANTLR Grammar
$SRC/o.a.c.cql3/Cql.g
#Cassandra13
o.a.c.cql3.statements.ParsedStatement
// Subclasses generated by ANTLR
// Tracks bound term count
// Prepare CQLStatement
prepare()
#Cassandra13
o.a.c.cql3.statements.CQLStatement
checkAccess(ClientState state)
validate(ClientState state)
execute(ConsistencyLevel cl,
QueryState state,
List<ByteBuffer> variables)
#Cassandra13
statements.SelectStatement.RawStatement
// Implements ParsedStatement
// Input validation
prepare()
#Cassandra13
statements.SelectStatement.execute()
// Create ReadCommands
StorageProxy.read()
#Cassandra13
Architecture
API
Dynamo
Database
#Cassandra13
Dynamo Layer
o.a.c.service
o.a.c.net
o.a.c.dht
o.a.c.gms
o.a.c.locator
o.a.c.stream
#Cassandra13
o.a.c.service.StorageProxy
// Cluster wide storage operations
// Select endpoints & check CL available
// Send messages to Stages
// Wait for response
// Store Hints
#Cassandra13
o.a.c.service.StorageService
// Ring operations
// Track ring state
// Start & stop ring membership
// Node & token queries
#Cassandra13
o.a.c.service.IResponseResolver
preprocess(MessageIn<T> message)
resolve() throws
DigestMismatchException
RowDigestResolver
RowDataResolver
RangeSliceResponseResolver
#Cassandra13
Response Handlers / Callback
implements IAsyncCallback<T>
response(MessageIn<T> msg)
#Cassandra13
o.a.c.service.ReadCallback.get()
//Wait for blockfor & data
condition.await(timeout,
TimeUnit.MILLISECONDS)
throw ReadTimeoutException()
resolver.resolve()
#Cassandra13
o.a.c.service.StorageProxy.fetchRows()
getLiveSortedEndpoints()
new RowDigestResolver()
new ReadCallback()
MessagingService.sendRR()
---------------------------------------
ReadCallback.get() # blocking
catch (DigestMismatchException ex)
catch (ReadTimeoutException ex)
#Cassandra13
Dynamo Layer
o.a.c.service
o.a.c.net
o.a.c.dht
o.a.c.gms
o.a.c.locator
o.a.c.stream
#Cassandra13
o.a.c.net.MessagingService.Verb <<enum>>
MUTATION
READ
REQUEST_RESPONSE
TREE_REQUEST
TREE_RESPONSE
(And more...)
#Cassandra13
o.a.c.net.MessagingService.verbHandlers
new EnumMap<Verb,
IVerbHandler>(Verb.class)
#Cassandra13
o.a.c.net.IVerbHandler<T>
doVerb(MessageIn<T> message,
String id);
#Cassandra13
o.a.c.net.MessagingService.verbStages
new EnumMap<MessagingService.Verb,
Stage>(MessagingService.Verb.class)
#Cassandra13
o.a.c.net.MessagingService.receive()
runnable = new MessageDeliveryTask(
message, id, timestamp);
StageManager.getStage(
message.getMessageType());
stage.execute(runnable);
#Cassandra13
o.a.c.net.MessageDeliveryTask.run()
// If droppable and rpc_timeout
MessagingService.incrementDroppedMessages(verb);
MessagingService.getVerbHandler(verb)
verbHandler.doVerb(message, id)
#Cassandra13
Architecture
API Layer
Dynamo Layer
Database Layer
#Cassandra13
Database Layer
o.a.c.concurrent
o.a.c.db
o.a.c.cache
o.a.c.io
o.a.c.trace
#Cassandra13
o.a.c.concurrent.StageManager
stages = new EnumMap<Stage,
ThreadPoolExecutor>(Stage.class);
getStage(Stage stage)
#Cassandra13
o.a.c.concurrent.Stage
READ
MUTATION
GOSSIP
REQUEST_RESPONSE
ANTI_ENTROPY
(And more...)
#Cassandra13
Database Layer
o.a.c.concurrent
o.a.c.db
o.a.c.cache
o.a.c.io
o.a.c.trace
#Cassandra13
o.a.c.db.Table
// Keyspace
open(String table)
getColumnFamilyStore(String cfName)
getRow(QueryFilter filter)
apply(RowMutation mutation,
boolean writeCommitLog)
#Cassandra13
o.a.c.db.ColumnFamilyStore
// Column Family
getColumnFamily(QueryFilter filter)
getTopLevelColumns(...)
apply(DecoratedKey key,
ColumnFamily columnFamily,
SecondaryIndexManager.Updater
indexer)
#Cassandra13
o.a.c.db.IColumnContainer
addColumn(IColumn column)
remove(ByteBuffer columnName)
ColumnFamily
SuperColumn
#Cassandra13
o.a.c.db.ISortedColumns
addColumn(IColumn column,
Allocator allocator)
removeColumn(ByteBuffer name)
ArrayBackedSortedColumns
AtomicSortedColumns
TreeMapBackedSortedColumns
#Cassandra13
o.a.c.db.Memtable
put(DecoratedKey key,
ColumnFamily columnFamily,
SecondaryIndexManager.Updater
indexer)
flushAndSignal(CountDownLatch latch,
Future<ReplayPosition>
context)
#Cassandra13
o.a.c.db.ReadCommand
getRow(Table table)
SliceByNamesReadCommand
SliceFromReadCommand
#Cassandra13
o.a.c.db.IDiskAtomFilter
getMemtableColumnIterator(...)
getSSTableColumnIterator(...)
IdentityQueryFilter
NamesQueryFilter
SliceQueryFilter
#Cassandra13
Summary
CustomTThreadPoolServer Message.Dispatcher
CassandraServer QueryProcessor
ReadCommand
StorageProxy
IResponseResolver
IAsyncCallback
MessagingService
IVerbHandler
Table ColumnFamilyStore IDiskAtomFilter
API
Dynamo
Database
#Cassandra13
Thanks.
#Cassandra13
Aaron Morton
@aaronmorton
www.thelastpickle.com
Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License

Cassandra SF 2013 - Cassandra Internals