APACHECON NORTH AMERICA 2013

    CASSANDRA
    INTERNALS
                       Aaron Morton
                       @aaronmorton
                     www.thelastpickle.com




   Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License
About Me
       Freelance Cassandra Consultant
      Based in Wellington, New Zealand
        Apache Cassandra Committer
     DataStax MVP for Apache Cassandra
Architecture
   Code
Cassandra Architecture
                             Clients


                              API's


                          Cluster Aware


                         Cluster Unaware



                              Disk
Cassandra Cluster Architecture
                     Clients


                      API's             API's


                  Cluster Aware     Cluster Aware


                 Cluster Unaware   Cluster Unaware



                      Disk              Disk

                     Node 1            Node 2
Dynamo Cluster Architecture
                  Clients


                   API's       API's


                  Dynamo      Dynamo


                  Database    Database



                   Disk        Disk

                  Node 1      Node 2
Architecture
    API
 Dynamo
 Database
API Transports

                    Thrift
                 Native Binary
                  Read Line
                     RMI
Thrift Transport

   //Custom TServer implementations

   o.a.c.thrift.CustomTThreadPoolServer
   o.a.c.thrift.CustomTNonBlockingServer
   o.a.c.thrift.CustomTHsHaServer
API Transports

                     Thrift
                 Native Binary
                  Read Line
                      RMI
Native Binary Transport

         Beta in Cassandra 1.2
           Uses Netty 3.5
             Enabled with
   start_native_transport
                    (Disabled by default)
o.a.c.transport.Server.run()

   //Setup the Netty server
   new ExecutionHandler()
   new NioServerSocketChannelFactory()
   ServerBootstrap.setPipelineFactory()
o.a.c.transport.Message.Dispatcher.messageReceived()

   //Process message from client
   ServerConnection.validateNewMessage()
   Request.execute()
   ServerConnection.applyStateTransition()
   Channel.write()
o.a.c.transport.messages

   CredentialsMessage()
   EventMessage()
   ExecuteMessage()
   PrepareMessage()
   QueryMessage()
   ResultMessage()
                  (And more...)
Messages


  Defined in the Native Binary
           Protocol
 $SRC/doc/native_protocol.spec
API Services

                JMX
                CLI
               Thrift
               CQL 3
JMX Management Beans

 Spread around the code base.

   Interfaces named *MBean
JMX Management Beans

    Registered with the names
             such as
     org.apache.cassandra.db:
        type=StorageProxy
API Services

                JMX
                CLI
               Thrift
               CQL 3
o.a.c.cli.CliMain.main()

  // Connect to server to read input
  this.connect()
  this.evaluateFileStatements()
  this.processStatementInteractive()
CLI Grammar


         ANTLR Grammar
  $SRC/src/java/o/a/c/cli/CLI.g
o.a.c.cli.CliClient.executeCLIStatement()

   // Process statement
   CliCompiler.compileQuery() #ANTLR
   switch (tree.getType())
       case...
API Services

                JMX
                CLI
               Thrift
               CQL 3
o.a.c.thrift.CassandraServer

  // Implements Thrift Interface
  // Access control
  // Input validation
  // Mapping to/from Thrift and internal types
Thrift Interface


                   Thrift IDL
$SRC/interface/cassandra.thrift
o.a.c.thrift.CassandraServer.get_slice()

  // get columns for one row
  Tracing.begin()
  ClientState cState = state()
  cState.hasColumnFamilyAccess()
  multigetSliceInternal()
CassandraServer.multigetSliceInternal()

  // get columns for many rows
  ThriftValidation.validate*()
  // Create ReadCommands
  getSlice()
CassandraServer.getSlice()

  // Process ReadCommands
  // return Thrift types

  readColumnFamily()
  thriftifyColumnFamily()
CassandraServer.readColumnFamily()

  // Process ReadCommands
  // Return ColumnFamilies

  StorageProxy.read()
API Services

                JMX
                CLI
               Thrift
               CQL 3
o.a.c.cql3.QueryProcessor

  // Prepares and executes CQL3 statements
  // Used by Thrift & Native transports
  // Access control
  // Input validation
  // Returns transport.ResultMessage
CQL3 Grammar


         ANTLR Grammar
       $SRC/o.a.c.cql3/Cql.g
o.a.c.cql3.statements.ParsedStatement

  // Subclasses generated by ANTLR
  // Tracks bound term count
  // Prepare CQLStatement
  prepare()
o.a.c.cql3.statements.CQLStatement

  checkAccess(ClientState state)
  validate(ClientState state)
  execute(ConsistencyLevel cl,
          QueryState state,
          List<ByteBuffer> variables)
o.a.c.cql3.functions.Function

  argsType()
  returnType()
  execute(List<ByteBuffer>
          parameters)
statements.SelectStatement.RawStatement

  // Implements ParsedStatement
  // Input validation
  prepare()
statements.SelectStatement.execute()

  // Create ReadCommands
  StorageProxy.read()
Architecture
    API
 Dynamo
 Database
Dynamo Layer
               o.a.c.service
                 o.a.c.net
                 o.a.c.dht
               o.a.c.locator
                 o.a.c.gms

               o.a.c.stream
o.a.c.service.StorageProxy

  // Cluster wide storage operations
  // Select endpoints & check CL available
  // Send messages to Stages
  // Wait for response
  // Store Hints
o.a.c.service.StorageService

  // Ring operations
  // Track ring state
  // Start & stop ring membership
  // Node & token queries
o.a.c.service.IResponseResolver

  preprocess(MessageIn<T> message)
  resolve() throws
   DigestMismatchException

  RowDigestResolver
  RowDataResolver
  RangeSliceResponseResolver
Response Handlers / Callback

  implements IAsyncCallback<T>

  response(MessageIn<T> msg)
o.a.c.service.ReadCallback.get()

  //Wait for blockfor & data
  condition.await(timeout,
   TimeUnit.MILLISECONDS)

  throw ReadTimeoutException()

  resolver.resolve()
o.a.c.service.StorageProxy.fetchRows()

  getLiveSortedEndpoints()
  new RowDigestResolver()
  new ReadCallback()
  MessagingService.sendRR()
  ---------------------------------------
  ReadCallback.get() # blocking
  catch (DigestMismatchException ex)
  catch (ReadTimeoutException ex)
Dynamo Layer
               o.a.c.service
                 o.a.c.net
                 o.a.c.dht
               o.a.c.locator
                o.a.c.gms

               o.a.c.stream
o.a.c.net.MessagingService.Verb<<enum>>

  MUTATION
  READ
  REQUEST_RESPONSE
  TREE_REQUEST
  TREE_RESPONSE
                (And more...)
o.a.c.net.MessagingService.verbHandlers


  new EnumMap<Verb,
     IVerbHandler>(Verb.class)
o.a.c.net.IVerbHandler<T>

  doVerb(MessageIn<T> message,
         String id);
o.a.c.net.MessagingService.verbStages

  new EnumMap<MessagingService.Verb,
      Stage>(MessagingService.Verb.class)
o.a.c.net.MessagingService.receive()

  runnable = new MessageDeliveryTask(
    message, id, timestamp);

  StageManager.getStage(
    message.getMessageType());

  stage.execute(runnable);
o.a.c.net.MessageDeliveryTask.run()

  // If droppable and rpc_timeout
  MessagingService.incrementDroppedMessages(verb);

  MessagingService.getVerbHandler(verb)
  verbHandler.doVerb(message, id)
Dynamo Layer
               o.a.c.service
                 o.a.c.net
                 o.a.c.dht
               o.a.c.locator
                o.a.c.gms

               o.a.c.stream
o.a.c.dht.IPartitioner<T extends Token>

  getToken(ByteBuffer key)
  getRandomToken()

  LocalPartitioner
  RandomPartitioner
  Murmur3Partitioner
o.a.c.dht.Token<T>

  compareTo(Token<T> o)

  BytesToken
  BigIntegerToken
  LongToken
Dynamo Layer
               o.a.c.service
                 o.a.c.net
                 o.a.c.dht
               o.a.c.locator
                 o.a.c.gms

               o.a.c.stream
o.a.c.locator.IEndpointSnitch

  getRack(InetAddress endpoint)
  getDatacenter(InetAddress endpoint)
  sortByProximity(InetAddress address,
   List<InetAddress> addresses)

  SimpleSnitch
  PropertyFileSnitch
  Ec2MultiRegionSnitch
o.a.c.locator.AbstractReplicationStrategy

  getNaturalEndpoints(
      RingPosition searchPosition)
  calculateNaturalEndpoints(Token
    searchToken, TokenMetadata
    tokenMetadata)

  SimpleStrategy
  NetworkTopologyStrategy
o.a.c.locator.TokenMetadata

  BiMultiValMap<Token, InetAddress>
      tokenToEndpointMap
  BiMultiValMap<Token, InetAddress>
      bootstrapTokens
  Set<InetAddress> leavingEndpoints
Dynamo Layer
               o.a.c.service
                 o.a.c.net
                 o.a.c.dht
               o.a.c.locator
                o.a.c.gms

               o.a.c.stream
o.a.c.gms.VersionedValue

  // VersionGenerator.getNextVersion()

  public final int version;
  public final String value;
o.a.c.gms.ApplicationState<<enum>>

  STATUS
  LOAD
  SCHEMA
  DC
  RACK
                 (And more...)
o.a.c.gms.HeartBeatState

  //VersionGenerator.getNextVersion();

  private int generation;
  private int version;
o.a.c.gms.Gossiper.GossipTask.run()

  // SYN -> ACK -> ACK2
  makeRandomGossipDigest()
  new GossipDigestSyn()

  // Use MessagingService.sendOneWay()
  Gossiper.doGossipToLiveMember()
  Gossiper.doGossipToUnreachableMember()
  Gossiper.doGossipToSeed()
gms.GossipDigestSynVerbHandler.doVerb()

  Gossiper.examineGossiper()
  new GossipDigestAck()
  MessagingService.sendOneWay()
gms.GossipDigestAck2VerbHandler.doVerb()

  Gossiper.notifyFailureDetector()
  Gossiper.applyStateLocally()
Architecture
  API Layer
Dynamo Layer
Database Layer
Database Layer
                 o.a.c.concurrent
                      o.a.c.db

                   o.a.c.cache
                     o.a.c.io
                   o.a.c.trace
o.a.c.concurrent.StageManager

  stages = new EnumMap<Stage,
    ThreadPoolExecutor>(Stage.class);

  getStage(Stage stage)
o.a.c.concurrent.Stage


  READ
  MUTATION
  GOSSIP
  REQUEST_RESPONSE
  ANTI_ENTROPY
                (And more...)
Database Layer
                 o.a.c.concurrent
                      o.a.c.db

                   o.a.c.cache
                     o.a.c.io
                   o.a.c.trace
o.a.c.db.Table

  // Keyspace
  open(String table)
  getColumnFamilyStore(String cfName)

  getRow(QueryFilter filter)
  apply(RowMutation mutation,
       boolean writeCommitLog)
o.a.c.db.ColumnFamilyStore

  // Column Family
  getColumnFamily(QueryFilter filter)
  getTopLevelColumns(...)

  apply(DecoratedKey key,
       ColumnFamily columnFamily,
       SecondaryIndexManager.Updater
       indexer)
o.a.c.db.IColumnContainer

  addColumn(IColumn column)
  remove(ByteBuffer columnName)

  ColumnFamily
  SuperColumn
o.a.c.db.ISortedColumns

  addColumn(IColumn column,
            Allocator allocator)
  removeColumn(ByteBuffer name)

  ArrayBackedSortedColumns
  AtomicSortedColumns
  TreeMapBackedSortedColumns
o.a.c.db.Memtable

  put(DecoratedKey key,
      ColumnFamily columnFamily,
      SecondaryIndexManager.Updater
      indexer)

  flushAndSignal(CountDownLatch latch,
                 Future<ReplayPosition>
                 context)
Memtable.FlushRunnable.writeSortedContents()

  // SSTableWriter
  createFlushWriter()

  // Iterate through rows & CF’s in order
  writer.append()
o.a.c.db.ReadCommand

  getRow(Table table)

  SliceByNamesReadCommand
  SliceFromReadCommand
o.a.c.db.IDiskAtomFilter

  getMemtableColumnIterator(...)
  getSSTableColumnIterator(...)

  IdentityQueryFilter
  NamesQueryFilter
  SliceQueryFilter
Thanks.
Aaron Morton
                     @aaronmorton
                   www.thelastpickle.com




Licensed under a Creative Commons Attribution-NonCommercial 3.0 New Zealand License

Apache Con NA 2013 - Cassandra Internals