SlideShare a Scribd company logo
1 of 109
Download to read offline
Hadoop
3    Hadoop
Hadoop

•
    -
    -
    -

•   HDFS(Hadoop Distributed Filesystem)
HDFS

•
    -
        ‣   MB, GB, TB

    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -
        ‣
    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -        64MB




    -
        ‣
        ‣
        ‣
HDFS

•
    -   /

    -       (   )

    -       (   )
HDFS

•
    -
               (           )

    -              (
                       )
HDFS

•
    -
    -

    -
HDFS

•

    -          (   ,   )

    -
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
•   hadoop fs -copyFromLocal <localsrc> ... <dst>

•   hadoop fs -copyToLocal <src> <localdst>

•   hadoop fs -ls <path>

•   hadoop fs -mkdir <path>


•   hadoop fs -help
Hadoop

                        •hadoop fs -ls file:///
                        •hadoop fs -ls hdfs:///
                        •hadoop fs -ls hftp:///
                         URI
                                                    java

        local             file          org.apache.hadoop.fs.LocalFileSystem
        HDFS             hdfs     org.apache.hadoop.hdfs.DistributedFileSystem
        HFTP             hftp         org.apache.hadoop.hdfs.HftpFileSystem
        HSFTP            hsftp       org.apache.hadoop.hdfs.HsftpFileSystem
        HAR               har          org.apache.hadoop.fs.HarFileSystem
         KFS              kfs       org.apache.hadoop.fs.kfs.KosmosFileSystem
         FTP              ftp         org.apache.hadoop.fs.ftp.FTPFileSystem
         S3
                         s3n     org.apache.hadoop.fs.s3native.NativeS3FileSystem
    (           )
         S3
                          s3            org.apache.hadoop.fs.s3.S3FileSystem
(                   )
•   Thrift

•   C
    -   libhdfs




•   FUSE(FileSystem in Userspace)

•   WebDAV

•
    -   HTTP, FTP(           )
Java

    •   Hadoop URL

/**
 * Copies the resource named by the first command-line argument to standard output,
 * opening it through {@code java.net.URL}.
 */
public class URLCat {

    /** Buffer size passed to {@code IOUtils.copyBytes}. */
    private static final int COPY_BUFFER_SIZE = 4096;

    // Install the Hadoop stream-handler factory on java.net.URL once, when this class loads.
    static {
        URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory());
    }

    /**
     * @param args {@code args[0]} is the URL to print
     * @throws Exception if the URL cannot be opened or copied
     */
    public static void main(String[] args) throws Exception {
        InputStream source = null;
        try {
            source = new URL(args[0]).openStream();
            // NOTE(review): the trailing 'false' presumably tells copyBytes not to close the
            // streams itself; the input is closed in the finally block — confirm against IOUtils.
            IOUtils.copyBytes(source, System.out, COPY_BUFFER_SIZE, false);
        } finally {
            IOUtils.closeStream(source);
        }
    }
}
Java

•   FileSystem API

    public class FileSystemCat {
    	 public static void main(String[] args) throws Exception {
    	 	 String uri = args[0];
    	 	 Configuration conf = new Configuration();
    	 	 FileSystem fs = FileSystem.get(URI.create(uri), conf);
    	 	 InputStream in = null;
    	 	 try {
    	 	 	 in = fs.open(new Path(uri));
    	 	 	 IOUtils.copyBytes(in, System.out, 4096, false);
    	 	 } finally {
    	 	 	 IOUtils.closeStream(in);
    	 	 }
    	 }
Java

•       FSDataInputStream
    // Declaration-only excerpt: shows that FSDataInputStream extends DataInputStream and
    // implements both Seekable and PositionedReadable.
    public class FSDataInputStream extends DataInputStream
        implements Seekable, PositionedReadable {

          // (class body not shown in this excerpt)
    }




        /**
         * Positioning operations for a stream; FSDataInputStream is declared to implement this.
         * NOTE(review): the method notes below are inferred from the signatures and from the
         * seek(0) call in FileSystemDoubleCat — confirm against the Hadoop Seekable Javadoc.
         */
        public interface Seekable {
            /** Moves the stream to {@code pos}; FileSystemDoubleCat uses {@code seek(0)} before re-reading. */
            void seek(long pos) throws IOException;
            /** Returns the stream's current position (presumably an offset from the start). */
            long getPos() throws IOException;
            /** Presumably tries to switch to another copy of the data at {@code targetPos}; the result reports the outcome. */
            boolean seekToNewSource(long targetPos) throws IOException;
        }
Java

  •   FSDataInputStream

/**
 * Prints the file named by the first command-line argument to standard output twice,
 * seeking back to position 0 between the two passes.
 */
public class FileSystemDoubleCat {

    /**
     * @param args {@code args[0]} is the URI of the file to print
     * @throws Exception if the file system cannot be obtained or the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String location = args[0];
        URI target = URI.create(location);
        FileSystem fileSystem = FileSystem.get(target, new Configuration());
        FSDataInputStream input = null;
        try {
            input = fileSystem.open(new Path(location));
            dump(input);
            // Rewind to the start so the second pass emits the same content again.
            input.seek(0);
            dump(input);
        } finally {
            IOUtils.closeStream(input);
        }
    }

    /** Copies everything remaining in {@code stream} to standard output without closing it here. */
    private static void dump(FSDataInputStream stream) throws Exception {
        IOUtils.copyBytes(stream, System.out, 4096, false);
    }
}
Java

•       FSDataInputStream
    // Declaration-only excerpt, repeated here to introduce the PositionedReadable interface
    // that FSDataInputStream implements alongside Seekable.
    public class FSDataInputStream extends DataInputStream
        implements Seekable, PositionedReadable {

         // (class body not shown in this excerpt)
    }




/**
 * Read operations that take an explicit {@code position}; FSDataInputStream is declared to
 * implement this.
 * NOTE(review): the method notes below are inferred from the signatures — confirm against the
 * Hadoop PositionedReadable Javadoc.
 */
public interface PositionedReadable {
    /**
     * Presumably reads up to {@code length} bytes starting at {@code position} into
     * {@code buffer} beginning at {@code offset}, returning the number of bytes read.
     */
    int read(long position, byte buffer[], int offset, int length)
    throws IOException;
    /** Presumably like {@code read}, but fills exactly {@code length} bytes or fails. */
    void readFully(long position, byte buffer[], int offset, int length)
    throws IOException;
    /** Presumably fills the whole of {@code buffer} starting at {@code position}. */
    void readFully(long position, byte buffer[]) throws IOException;
}
Java

•
    -   public FSDataOutputStream create(Path f)
        throws IOException

    -   public FSDataOutputStream append(Path f)
        throws IOException
Java

•   FSDataOutputStream
    -   FileSystem   create(), append()

    -

        // Excerpt of the stream type that the FileSystem create() and append() signatures
        // listed on the earlier slide return. Only getPos() is shown; no seek() appears here.
        public class FSDataOutputStream extends DataOutputStream
            implements Syncable {

            // Position query for the output stream.
            public long getPos() throws IOException {
                // (body omitted on the slide)
            }

            // (remaining members omitted on the slide)
        }
Java

•
    -   public boolean mkdirs(Path f) throws IOException
Java

  •
// Fetch the FileStatus for one path, then read its individual attributes.
// NOTE(review): the slide's original per-line annotations were lost in extraction; the notes
// below are restored from the accessor names — confirm against the Hadoop FileStatus Javadoc.
FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge"));

status.isDir(); // directory flag
status.getLen();     // length (presumably in bytes)
status.getModificationTime();            // last-modified time
status.getReplication();            // replication setting
status.getBlockSize();         // block size (the slide notes 64MB here, presumably the default)
status.getOwner();        // owner
status.getGroup();        // group
status.getPermission().toString();            // permission, rendered as a string
Java

•
    -   public FileStatus[] listStatus(Path f) throws IOException;

    -   public FileStatus[] listStatus(Path f, PathFilter filter)
        throws IOException;

    -   public FileStatus[] listStatus(Path[] files)
        throws IOException;

    -   public FileStatus[] listStatus(Path[] files, PathFilter filter)
        throws IOException;
Java

•
    /**
     * Lists the status entries for every path given on the command line and prints the path
     * component of each entry, one per line.
     */
    public class ListStatus {
        /**
         * @param args paths to list; {@code args[0]} also selects the file system
         * @throws Exception if the file system cannot be obtained or listing fails
         */
        public static void main(String[] args) throws Exception {
            final String firstArg = args[0];
            final Configuration configuration = new Configuration();
            final FileSystem fileSystem = FileSystem.get(URI.create(firstArg), configuration);

            // Convert every argument (including the first) into a Path.
            final Path[] targets = new Path[args.length];
            int slot = 0;
            for (String arg : args) {
                targets[slot++] = new Path(arg);
            }

            final FileStatus[] listing = fileSystem.listStatus(targets);
            for (int i = 0; i < listing.length; i++) {
                // Print only the path part of the entry's URI.
                System.out.println(listing[i].getPath().toUri().getPath());
            }
        }
    }
Java

•
    -   public FileStatus[] globStatus(Path pathPattern) throws IOException

    -   public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
        throws IOException
Java

•


    [ab]                        {a,b}


    [^ab]                       {a,b}

                            {a,b}           (a b       )
    [a-b]
                    a       b
                    {a,b}           (a b           )       a   b
    [^a-b]

    {a,b}                               a    b


     \c                     c                      c
Java

•
    /**
     * Predicate over paths, accepted by the {@code listStatus} and {@code globStatus}
     * overloads that take a {@code PathFilter}.
     */
    public interface PathFilter {
        /**
         * Decides whether {@code path} passes the filter. RegexExcludePathFilter returns
         * {@code false} for the paths it is meant to exclude.
         */
        boolean accept(Path path);
    }
Java

   •
         /**
          * A {@code PathFilter} that rejects every path whose string form fully matches a
          * regular expression and accepts all others.
          *
          * <p>The expression is compiled once at construction instead of on every
          * {@link #accept} call, since the filter is consulted once per candidate path.
          */
         public class RegexExcludePathFilter implements PathFilter {

             /** Pre-compiled form of the exclusion expression. */
             private final java.util.regex.Pattern pattern;

             /**
              * @param regex expression identifying the paths to exclude; must match the
              *              whole of {@code path.toString()} for a path to be rejected
              * @throws java.util.regex.PatternSyntaxException if {@code regex} is not valid
              * @throws NullPointerException if {@code regex} is {@code null}
              */
             public RegexExcludePathFilter(String regex) {
                 this.pattern = java.util.regex.Pattern.compile(regex);
             }

             /**
              * @param path candidate path
              * @return {@code false} when the path's string form matches the expression
              *         in full, {@code true} otherwise
              */
             @Override
             public boolean accept(Path path) {
                 // Matcher.matches() requires a whole-input match, the same as String.matches().
                 return !pattern.matcher(path.toString()).matches();
             }
         }




// Glob for /2007/*/* and pass a RegexExcludePathFilter, whose accept() rejects any path
// matching its regex (here: a path ending in /2007/12/31).
fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"));
Java

•
    -   public boolean delete(Path f, boolean recursive)
        throws IOException;
•
HDFS       DistributedFileSystem                              NameNode




            FSDataInputStream




               DataNode1             DataNode2    DataNode3              DataNode4



            block1                 block3        block1              block2

            block4                 block4        block2              block3
•
           open(new Path(“/aaa.txt”))
HDFS                                    DistributedFileSystem                              NameNode




                                         FSDataInputStream




                                            DataNode1             DataNode2    DataNode3              DataNode4



                                         block1                 block3        block1              block2

                                         block4                 block4        block2              block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (          )
HDFS                                    DistributedFileSystem                                     NameNode




                                         FSDataInputStream




                                            DataNode1                DataNode2        DataNode3              DataNode4



                                         block1                   block3             block1              block2

                                         block4                   block4             block2              block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (          )
HDFS                                    DistributedFileSystem                                     NameNode


                                                                                                   aaa.txt : block1, block2, block3, block4

                                                                                                   block1 : DataNode1, DataNode3
                                                                                                   block2 : DataNode3, DataNode4
                                         FSDataInputStream                                         block3 : DataNode2, DataNode3
                                                                                                   block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2        DataNode3                 DataNode4



                                         block1                   block3             block1                  block2

                                         block4                   block4             block2                  block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                   )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                         read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

               close()                                                                                        block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                 block3 : DataNode2, DataNode3      block3 : DataNode2, DataNode3
                                                                           block4 : DataNode1, DataNode2      block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                       block1                   block2

                                         block4                   block4                       block2                   block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))                                 (                )
HDFS                                    DistributedFileSystem                                                NameNode

                                                                 block1 : DataNode1, DataNode3
                        read()
                                                                 block2 : DataNode3, DataNode4                aaa.txt : block1, block2, block3, block4

                                                                                                              block1 : DataNode1, DataNode3
                                                                                                              block2 : DataNode3, DataNode4
                                         FSDataInputStream                                                    block3 : DataNode2, DataNode3
                                                                                                              block4 : DataNode1, DataNode2




                                            DataNode1                DataNode2                   DataNode3                 DataNode4



                                         block1                   block3                     block1                     block2

                                         block4                   block4                     block2                     block3
•
    -
    -
    -
•
    -
    -                  9.1.1

    -
                                    (/d1/r1/n1, /d1/r1/n1) = 0
                  d1           d2   (/d1/r1/n1, /d1/r1/n2) = 2

                                    (/d1/r1/n1, /d1/r2/n3) = 4
        r1             r2      r3
                                    (/d1/r1/n1, /d2/r3/n4) = 6

n1           n2        n3      n4
•
HDFS       DistributedFileSystem                           NameNode




           FSDataOutputStream




               DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode




                                          FSDataOutputStream




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                           NameNode

                       write()



                                          FSDataOutputStream




                           ack




                                              DataNode1           DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2                DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2                DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                              NameNode

                       write()



                                          FSDataOutputStream                       block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2                DataNode3


                                            block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()



                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()


                  close()
                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                            ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                                NameNode

                       write()


                  close()
                                          FSDataOutputStream                         block1 : DataNode1, DataNode2, DataNode3

                                                        DataStreamer




                            ack




                                              DataNode1                  DataNode2                DataNode3


                                            block1                     block1                  block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2   DataNode3
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2   DataNode3


                                            block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                  NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2   DataNode3


                                            block2
                                            block1                     block1
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block1        block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                   NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2      DataNode3


                                            block2
                                            block1                                 block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                   NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                DataNode2      DataNode3


                                            block2
                                            block1                                 block2
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                                     NameNode

                       write()



                                          FSDataOutputStream

                                                        DataStreamer




                           ack




                                              DataNode1                  DataNode2      DataNode3


                                            block2
                                            block1                     block2        block2
•
    -                      dfs.replication.min(        1)



    -   (dfs.replication                          3)




    -
•
    1.       (       )

    2.

    3.

    4.   (       )
•
    -

             fs.create(new Path("p"));




    -

        OutputStream out = fs.create(new Path("p"));
        out.write("content".getBytes("UTF-8"));
        out.flush();
•
    -   FSDataOutputStream sync()

    -   sync()   close()


                 FSDataOutputStream out = fs.create(new Path("p"));
                 out.write("content".getBytes("UTF-8"));
                 out.flush();
                 out.sync();
•
    -
        ‣   sync()

        ‣            sync()

        ‣   sync()
distcp

•   2        HDFS

    -   hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar

    -   hadoop distcp -overwrite hdfs://namenode1/foo hdfs://namenode2/bar/foo

    -   hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo


•   MapReduce

    -                   256MB                 (1GB                 4             )

    -                                   map             (
                        )

    -                map        1        (tasktracker)             20map
Hadoop

•

•   HAR


•   hadoop archive -archiveName files.har /my/files /my
Hadoop

•
    -
                                             (
                )

    -
    -   HAR            MapReduce
                                     (   7.2.1.4
        CombineFileInputFormat   )
•   HDFS
    -
    -
    -
    -


•   distcp

•   HAR
•

•

More Related Content

What's hot

Perl for System Automation - 01 Advanced File Processing
Perl for System Automation - 01 Advanced File ProcessingPerl for System Automation - 01 Advanced File Processing
Perl for System Automation - 01 Advanced File ProcessingDanairat Thanabodithammachari
 
Tajo Seoul Meetup-201501
Tajo Seoul Meetup-201501Tajo Seoul Meetup-201501
Tajo Seoul Meetup-201501Jinho Kim
 
Hypertable - massively scalable nosql database
Hypertable - massively scalable nosql databaseHypertable - massively scalable nosql database
Hypertable - massively scalable nosql databasebigdatagurus_meetup
 
Hypertable
HypertableHypertable
Hypertablebetaisao
 
Database Architectures and Hypertable
Database Architectures and HypertableDatabase Architectures and Hypertable
Database Architectures and Hypertablehypertable
 
Hdfs connector api
Hdfs connector apiHdfs connector api
Hdfs connector apiThang Loi
 
Course 102: Lecture 3: Basic Concepts And Commands
Course 102: Lecture 3: Basic Concepts And Commands Course 102: Lecture 3: Basic Concepts And Commands
Course 102: Lecture 3: Basic Concepts And Commands Ahmed El-Arabawy
 
Python mongo db-training-europython-2011
Python mongo db-training-europython-2011Python mongo db-training-europython-2011
Python mongo db-training-europython-2011Andreas Jung
 
Postgresql search demystified
Postgresql search demystifiedPostgresql search demystified
Postgresql search demystifiedjavier ramirez
 
Hadoop Interacting with HDFS
Hadoop Interacting with HDFSHadoop Interacting with HDFS
Hadoop Interacting with HDFSApache Apex
 
MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用iammutex
 
Percona Live 2017 ­- Sharded cluster tutorial
Percona Live 2017 ­- Sharded cluster tutorialPercona Live 2017 ­- Sharded cluster tutorial
Percona Live 2017 ­- Sharded cluster tutorialAntonios Giannopoulos
 
How mysql handles ORDER BY, GROUP BY, and DISTINCT
How mysql handles ORDER BY, GROUP BY, and DISTINCTHow mysql handles ORDER BY, GROUP BY, and DISTINCT
How mysql handles ORDER BY, GROUP BY, and DISTINCTSergey Petrunya
 
2015 bioinformatics python_io_wim_vancriekinge
2015 bioinformatics python_io_wim_vancriekinge2015 bioinformatics python_io_wim_vancriekinge
2015 bioinformatics python_io_wim_vancriekingeProf. Wim Van Criekinge
 
Fuse'ing python for rapid development of storage efficient FS
Fuse'ing python for rapid development of storage efficient FSFuse'ing python for rapid development of storage efficient FS
Fuse'ing python for rapid development of storage efficient FSChetan Giridhar
 
MongoDB & Hadoop: Flexible Hourly Batch Processing Model
MongoDB & Hadoop: Flexible Hourly Batch Processing ModelMongoDB & Hadoop: Flexible Hourly Batch Processing Model
MongoDB & Hadoop: Flexible Hourly Batch Processing ModelTakahiro Inoue
 
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...Ontico
 

What's hot (20)

Perl for System Automation - 01 Advanced File Processing
Perl for System Automation - 01 Advanced File ProcessingPerl for System Automation - 01 Advanced File Processing
Perl for System Automation - 01 Advanced File Processing
 
Unix Basics Commands
Unix Basics CommandsUnix Basics Commands
Unix Basics Commands
 
Tajo Seoul Meetup-201501
Tajo Seoul Meetup-201501Tajo Seoul Meetup-201501
Tajo Seoul Meetup-201501
 
Hypertable - massively scalable nosql database
Hypertable - massively scalable nosql databaseHypertable - massively scalable nosql database
Hypertable - massively scalable nosql database
 
Hypertable
HypertableHypertable
Hypertable
 
Database Architectures and Hypertable
Database Architectures and HypertableDatabase Architectures and Hypertable
Database Architectures and Hypertable
 
Hdfs connector api
Hdfs connector apiHdfs connector api
Hdfs connector api
 
Course 102: Lecture 3: Basic Concepts And Commands
Course 102: Lecture 3: Basic Concepts And Commands Course 102: Lecture 3: Basic Concepts And Commands
Course 102: Lecture 3: Basic Concepts And Commands
 
Unix Basics For Testers
Unix Basics For TestersUnix Basics For Testers
Unix Basics For Testers
 
Python mongo db-training-europython-2011
Python mongo db-training-europython-2011Python mongo db-training-europython-2011
Python mongo db-training-europython-2011
 
Postgresql search demystified
Postgresql search demystifiedPostgresql search demystified
Postgresql search demystified
 
Hadoop Interacting with HDFS
Hadoop Interacting with HDFSHadoop Interacting with HDFS
Hadoop Interacting with HDFS
 
MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用
 
Percona Live 2017 ­- Sharded cluster tutorial
Percona Live 2017 ­- Sharded cluster tutorialPercona Live 2017 ­- Sharded cluster tutorial
Percona Live 2017 ­- Sharded cluster tutorial
 
How mysql handles ORDER BY, GROUP BY, and DISTINCT
How mysql handles ORDER BY, GROUP BY, and DISTINCTHow mysql handles ORDER BY, GROUP BY, and DISTINCT
How mysql handles ORDER BY, GROUP BY, and DISTINCT
 
2015 bioinformatics python_io_wim_vancriekinge
2015 bioinformatics python_io_wim_vancriekinge2015 bioinformatics python_io_wim_vancriekinge
2015 bioinformatics python_io_wim_vancriekinge
 
Fuse'ing python for rapid development of storage efficient FS
Fuse'ing python for rapid development of storage efficient FSFuse'ing python for rapid development of storage efficient FS
Fuse'ing python for rapid development of storage efficient FS
 
HDFS_Command_Reference
HDFS_Command_ReferenceHDFS_Command_Reference
HDFS_Command_Reference
 
MongoDB & Hadoop: Flexible Hourly Batch Processing Model
MongoDB & Hadoop: Flexible Hourly Batch Processing ModelMongoDB & Hadoop: Flexible Hourly Batch Processing Model
MongoDB & Hadoop: Flexible Hourly Batch Processing Model
 
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...
Полнотекстовый поиск в PostgreSQL за миллисекунды (Олег Бартунов, Александр К...
 

Similar to 第2回 Hadoop 輪読会

WhatsNewNIO2.pdf
WhatsNewNIO2.pdfWhatsNewNIO2.pdf
WhatsNewNIO2.pdfMohit Kumar
 
Big data using Hadoop, Hive, Sqoop with Installation
Big data using Hadoop, Hive, Sqoop with InstallationBig data using Hadoop, Hive, Sqoop with Installation
Big data using Hadoop, Hive, Sqoop with Installationmellempudilavanya999
 
Accessing external hadoop data sources using pivotal e xtension framework (px...
Accessing external hadoop data sources using pivotal e xtension framework (px...Accessing external hadoop data sources using pivotal e xtension framework (px...
Accessing external hadoop data sources using pivotal e xtension framework (px...Sameer Tiwari
 
5. Ввод-вывод, доступ к файловой системе
5. Ввод-вывод, доступ к файловой системе5. Ввод-вывод, доступ к файловой системе
5. Ввод-вывод, доступ к файловой системеDEVTYPE
 
RESTful Web Services with Jersey
RESTful Web Services with JerseyRESTful Web Services with Jersey
RESTful Web Services with JerseyScott Leberknight
 
Java 7 - short intro to NIO.2
Java 7 - short intro to NIO.2Java 7 - short intro to NIO.2
Java 7 - short intro to NIO.2Martijn Verburg
 
Big data, just an introduction to Hadoop and Scripting Languages
Big data, just an introduction to Hadoop and Scripting LanguagesBig data, just an introduction to Hadoop and Scripting Languages
Big data, just an introduction to Hadoop and Scripting LanguagesCorley S.r.l.
 
Javase7 1641812
Javase7 1641812Javase7 1641812
Javase7 1641812Vinay H G
 
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...Cloudera, Inc.
 
Node.js - A practical introduction (v2)
Node.js  - A practical introduction (v2)Node.js  - A practical introduction (v2)
Node.js - A practical introduction (v2)Felix Geisendörfer
 
Leveraging Hadoop in your PostgreSQL Environment
Leveraging Hadoop in your PostgreSQL EnvironmentLeveraging Hadoop in your PostgreSQL Environment
Leveraging Hadoop in your PostgreSQL EnvironmentJim Mlodgenski
 
Oscon Java Testing on the Fast Lane
Oscon Java Testing on the Fast LaneOscon Java Testing on the Fast Lane
Oscon Java Testing on the Fast LaneAndres Almiray
 
Nov. 4, 2011 o reilly webcast-hbase- lars george
Nov. 4, 2011 o reilly webcast-hbase- lars georgeNov. 4, 2011 o reilly webcast-hbase- lars george
Nov. 4, 2011 o reilly webcast-hbase- lars georgeO'Reilly Media
 
Gsummit apis-2012
Gsummit apis-2012Gsummit apis-2012
Gsummit apis-2012Gluster.org
 
Gsummit apis-2013
Gsummit apis-2013Gsummit apis-2013
Gsummit apis-2013Gluster.org
 
Building Restful Web Services with Java
Building Restful Web Services with JavaBuilding Restful Web Services with Java
Building Restful Web Services with JavaVassil Popovski
 
Cosmos, Big Data GE implementation in FIWARE
Cosmos, Big Data GE implementation in FIWARECosmos, Big Data GE implementation in FIWARE
Cosmos, Big Data GE implementation in FIWAREFernando Lopez Aguilar
 

Similar to 第2回 Hadoop 輪読会 (20)

WhatsNewNIO2.pdf
WhatsNewNIO2.pdfWhatsNewNIO2.pdf
WhatsNewNIO2.pdf
 
Big data using Hadoop, Hive, Sqoop with Installation
Big data using Hadoop, Hive, Sqoop with InstallationBig data using Hadoop, Hive, Sqoop with Installation
Big data using Hadoop, Hive, Sqoop with Installation
 
Accessing external hadoop data sources using pivotal e xtension framework (px...
Accessing external hadoop data sources using pivotal e xtension framework (px...Accessing external hadoop data sources using pivotal e xtension framework (px...
Accessing external hadoop data sources using pivotal e xtension framework (px...
 
Hadoop HDFS
Hadoop HDFS Hadoop HDFS
Hadoop HDFS
 
5. Ввод-вывод, доступ к файловой системе
5. Ввод-вывод, доступ к файловой системе5. Ввод-вывод, доступ к файловой системе
5. Ввод-вывод, доступ к файловой системе
 
RESTful Web Services with Jersey
RESTful Web Services with JerseyRESTful Web Services with Jersey
RESTful Web Services with Jersey
 
Java 7 - short intro to NIO.2
Java 7 - short intro to NIO.2Java 7 - short intro to NIO.2
Java 7 - short intro to NIO.2
 
Big data, just an introduction to Hadoop and Scripting Languages
Big data, just an introduction to Hadoop and Scripting LanguagesBig data, just an introduction to Hadoop and Scripting Languages
Big data, just an introduction to Hadoop and Scripting Languages
 
Javase7 1641812
Javase7 1641812Javase7 1641812
Javase7 1641812
 
Jug java7
Jug java7Jug java7
Jug java7
 
PyFilesystem
PyFilesystemPyFilesystem
PyFilesystem
 
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...
HBaseCon 2012 | HBase Coprocessors – Deploy Shared Functionality Directly on ...
 
Node.js - A practical introduction (v2)
Node.js  - A practical introduction (v2)Node.js  - A practical introduction (v2)
Node.js - A practical introduction (v2)
 
Leveraging Hadoop in your PostgreSQL Environment
Leveraging Hadoop in your PostgreSQL EnvironmentLeveraging Hadoop in your PostgreSQL Environment
Leveraging Hadoop in your PostgreSQL Environment
 
Oscon Java Testing on the Fast Lane
Oscon Java Testing on the Fast LaneOscon Java Testing on the Fast Lane
Oscon Java Testing on the Fast Lane
 
Nov. 4, 2011 o reilly webcast-hbase- lars george
Nov. 4, 2011 o reilly webcast-hbase- lars georgeNov. 4, 2011 o reilly webcast-hbase- lars george
Nov. 4, 2011 o reilly webcast-hbase- lars george
 
Gsummit apis-2012
Gsummit apis-2012Gsummit apis-2012
Gsummit apis-2012
 
Gsummit apis-2013
Gsummit apis-2013Gsummit apis-2013
Gsummit apis-2013
 
Building Restful Web Services with Java
Building Restful Web Services with JavaBuilding Restful Web Services with Java
Building Restful Web Services with Java
 
Cosmos, Big Data GE implementation in FIWARE
Cosmos, Big Data GE implementation in FIWARECosmos, Big Data GE implementation in FIWARE
Cosmos, Big Data GE implementation in FIWARE
 

More from Toshihiro Suzuki

Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのか
Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのかApache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのか
Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのかToshihiro Suzuki
 
第25回 Hadoopソースコードリーディング 「HBase 最新情報」
第25回 Hadoopソースコードリーディング 「HBase 最新情報」第25回 Hadoopソースコードリーディング 「HBase 最新情報」
第25回 Hadoopソースコードリーディング 「HBase 最新情報」Toshihiro Suzuki
 
HDP ハンズオンセミナー
HDP ハンズオンセミナーHDP ハンズオンセミナー
HDP ハンズオンセミナーToshihiro Suzuki
 
Kuduを調べてみた #dogenzakalt
Kuduを調べてみた #dogenzakaltKuduを調べてみた #dogenzakalt
Kuduを調べてみた #dogenzakaltToshihiro Suzuki
 
HBaseを用いたグラフDB「Hornet」の設計と運用
HBaseを用いたグラフDB「Hornet」の設計と運用HBaseを用いたグラフDB「Hornet」の設計と運用
HBaseを用いたグラフDB「Hornet」の設計と運用Toshihiro Suzuki
 
HBaseを用いたグラフDB「Hornet」
HBaseを用いたグラフDB「Hornet」HBaseを用いたグラフDB「Hornet」
HBaseを用いたグラフDB「Hornet」Toshihiro Suzuki
 
HBaseでグラフ構造を扱う(開発中)
HBaseでグラフ構造を扱う(開発中)HBaseでグラフ構造を扱う(開発中)
HBaseでグラフ構造を扱う(開発中)Toshihiro Suzuki
 
Amebaサービスのログ解析基盤
Amebaサービスのログ解析基盤Amebaサービスのログ解析基盤
Amebaサービスのログ解析基盤Toshihiro Suzuki
 
MySQLによってタフになる会12章
MySQLによってタフになる会12章MySQLによってタフになる会12章
MySQLによってタフになる会12章Toshihiro Suzuki
 

More from Toshihiro Suzuki (10)

Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのか
Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのかApache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのか
Apache HBaseの現在 - 火山と呼ばれたHBaseは今どうなっているのか
 
第25回 Hadoopソースコードリーディング 「HBase 最新情報」
第25回 Hadoopソースコードリーディング 「HBase 最新情報」第25回 Hadoopソースコードリーディング 「HBase 最新情報」
第25回 Hadoopソースコードリーディング 「HBase 最新情報」
 
HDP ハンズオンセミナー
HDP ハンズオンセミナーHDP ハンズオンセミナー
HDP ハンズオンセミナー
 
Kuduを調べてみた #dogenzakalt
Kuduを調べてみた #dogenzakaltKuduを調べてみた #dogenzakalt
Kuduを調べてみた #dogenzakalt
 
HBaseを用いたグラフDB「Hornet」の設計と運用
HBaseを用いたグラフDB「Hornet」の設計と運用HBaseを用いたグラフDB「Hornet」の設計と運用
HBaseを用いたグラフDB「Hornet」の設計と運用
 
HBase at Ameba
HBase at AmebaHBase at Ameba
HBase at Ameba
 
HBaseを用いたグラフDB「Hornet」
HBaseを用いたグラフDB「Hornet」HBaseを用いたグラフDB「Hornet」
HBaseを用いたグラフDB「Hornet」
 
HBaseでグラフ構造を扱う(開発中)
HBaseでグラフ構造を扱う(開発中)HBaseでグラフ構造を扱う(開発中)
HBaseでグラフ構造を扱う(開発中)
 
Amebaサービスのログ解析基盤
Amebaサービスのログ解析基盤Amebaサービスのログ解析基盤
Amebaサービスのログ解析基盤
 
MySQLによってタフになる会12章
MySQLによってタフになる会12章MySQLによってタフになる会12章
MySQLによってタフになる会12章
 

Recently uploaded

Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesSinan KOZAK
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreternaman860154
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersThousandEyes
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024The Digital Insurer
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking MenDelhi Call girls
 
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | DelhiFULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhisoniya singh
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Paola De la Torre
 
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 3652toLead Limited
 
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...gurkirankumar98700
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonetsnaman860154
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationRadu Cotescu
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024Scott Keck-Warren
 
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Alan Dix
 
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024BookNet Canada
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityPrincipled Technologies
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Drew Madelung
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Allon Mureinik
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfEnterprise Knowledge
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slidevu2urc
 

Recently uploaded (20)

Unblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen FramesUnblocking The Main Thread Solving ANRs and Frozen Frames
Unblocking The Main Thread Solving ANRs and Frozen Frames
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for PartnersEnhancing Worker Digital Experience: A Hands-on Workshop for Partners
Enhancing Worker Digital Experience: A Hands-on Workshop for Partners
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men
 
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | DelhiFULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
FULL ENJOY 🔝 8264348440 🔝 Call Girls in Diplomatic Enclave | Delhi
 
Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101Salesforce Community Group Quito, Salesforce 101
Salesforce Community Group Quito, Salesforce 101
 
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
Tech-Forward - Achieving Business Readiness For Copilot in Microsoft 365
 
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
Kalyanpur ) Call Girls in Lucknow Finest Escorts Service 🍸 8923113531 🎰 Avail...
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024SQL Database Design For Developers at php[tek] 2024
SQL Database Design For Developers at php[tek] 2024
 
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...Swan(sea) Song – personal research during my six years at Swansea ... and bey...
Swan(sea) Song – personal research during my six years at Swansea ... and bey...
 
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
#StandardsGoals for 2024: What’s new for BISAC - Tech Forum 2024
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)Injustice - Developers Among Us (SciFiDevCon 2024)
Injustice - Developers Among Us (SciFiDevCon 2024)
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 

第2回 Hadoop 輪読会

  • 1. Hadoop 3 Hadoop
  • 2. Hadoop • - - - • HDFS(Hadoop Distributed Filesystem)
  • 3. HDFS • - ‣ MB, GB, TB - ‣ - ‣ ‣
  • 4. HDFS • - ‣ - ‣ - ‣ ‣
  • 5. HDFS • - 64MB - ‣ ‣ ‣
  • 6. HDFS • - / - ( ) - ( )
  • 7. HDFS • - ( ) - ( )
  • 8. HDFS • - - -
  • 9. HDFS • - ( , ) -
  • 10. HDFS • - NameNode SecondaryNameNode
  • 11. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 12. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 13. HDFS • - open() append() write() NameNode SecondaryNameNode
  • 14. HDFS • - NameNode SecondaryNameNode
  • 15. HDFS • - NameNode SecondaryNameNode
  • 16. HDFS • - NameNode SecondaryNameNode
  • 17. HDFS • - NameNode SecondaryNameNode
  • 18. hadoop fs -copyFromLocal <localsrc> ... <dst> • hadoop fs -copyToLocal <src> <localdst> • hadoop fs -ls <path> • hadoop fs -mkdir <path> • hadoop fs -help
  • 19. Hadoop •hadoop fs -ls file:/// •hadoop fs -ls hdfs:/// •hadoop fs -ls hftp:/// URI java local file org.apache.hadoop.fs.LocalFileSystem HDFS hdfs org.apache.hadoop.hdfs.DistributedFileSystem HFTP hftp org.apache.hadoop.hdfs.HftpFileSystem HSFTP hsftp org.apache.hadoop.hdfs.HsftpFileSystem HAR har org.apache.hadoop.fs.HarFileSystem KFS kfs org.apache.hadoop.fs.kfs.KosmosFileSystem FTP ftp org.apache.hadoop.fs.ftp.FTPFileSystem S3 s3n org.apache.hadoop.fs.s3native.NativeS3FileSystem ( ) S3 s3 org.apache.hadoop.fs.s3.S3FileSystem ( )
  • 20. Thrift • C - libhdfs • FUSE(FileSystem in Userspace) • WebDAV • - HTTP, FTP( )
  • 21. Java • Hadoop URL public class URLCat { static { URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); } public static void main(String[] args) throws Exception { InputStream in = null; try { in = new URL(args[0]).openStream(); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 22. Java • FileSystem API public class FileSystemCat { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); InputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 23. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface Seekable { void seek(long pos) throws IOException; long getPos() throws IOException; boolean seekToNewSource(long targetPos) throws IOException; }
  • 24. Java • FSDataInputStream public class FileSystemDoubleCat { public static void main(String[] args) throws Exception { String uri = args[0]; FileSystem fs = FileSystem.get(URI.create(uri), new Configuration()); FSDataInputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); in.seek(0); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  • 25. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface PositionedReadable { int read(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[]) throws IOException; }
  • 26. Java • - public FSDataOutputStream create(Path f) throws IOException - public FSDataOutputStream append(Path f) throws IOException
  • 27. Java • FSDataOutputStream - FileSystem create(), append() - public class FSDataOutputStream extends DataOutputStream implements Syncable { public long getPos() throws IOException { // } // }
  • 28. Java • - public boolean mkdirs(Path f) throws IOException
  • 29. Java • FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge")); status.isDir(); // status.getLen(); // status.getModificationTime(); // status.getReplication(); // status.getBlockSize(); // ( 64MB) status.getOwner(); // status.getGroup(); // status.getPermission().toString(); //
  • 30. Java • - public FileStatus[] listStatus(Path f) throws IOException; - public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException; - public FileStatus[] listStatus(Path[] files) throws IOException; - public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException;
  • 31. Java • public class ListStatus { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); Path[] paths = new Path[args.length]; for (int i = 0; i < paths.length; i++) { paths[i] = new Path(args[i]); } FileStatus[] status = fs.listStatus(paths); for (FileStatus stat : status) { System.out.println(stat.getPath().toUri().getPath()); } } }
  • 32. Java • - public FileStatus[] globStatus(Path pathPattern) throws IOException - public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
  • 33. Java • [ab] {a,b} [^ab] {a,b} {a,b} (a b ) [a-b] a b {a,b} (a b ) a b [^a-b] {a,b} a b \c c c
  • 34. Java • public interface PathFilter { boolean accept(Path path); }
  • 35. Java • public class RegexExcludePathFilter implements PathFilter { private final String regex; public RegexExcludePathFilter(String regex) { this.regex = regex; } @Override public boolean accept(Path path) { return !path.toString().matches(regex); } } fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"));
  • 36. Java • - public boolean delete(Path f, boolean recursive) throws IOException;
  • 37. • HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 38. open(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 39. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 40. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 41. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 42. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 43. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 44. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 45. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 46. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 47. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 48. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 49. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 close() block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 50. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 51. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 52. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 53. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 54. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 55. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 56. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 57. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 58. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 59. “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  • 60. - - -
  • 61. - - 9.1.1 - (/d1/r1/n1, /d1/r1/n1) = 0 d1 d2 (/d1/r1/n1, /d1/r1/n2) = 2 (/d1/r1/n1, /d1/r2/n3) = 4 r1 r2 r3 (/d1/r1/n1, /d2/r3/n4) = 6 n1 n2 n3 n4
  • 62. • HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 63. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 64. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 65. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 66. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DateNode1 DateNode2 DateNode3
  • 67. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 68. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 69. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  • 70. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 71. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 72. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  • 73. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  • 74. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  • 75. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 76. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 77. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 78. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 79. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 80. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 81. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 82. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 83. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  • 84. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  • 85. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  • 86. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 87. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 88. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  • 89. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 90. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 91. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  • 92. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 93. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 94. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 95. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 96. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  • 97. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  • 98. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  • 99. create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2 block2
  • 100. - dfs.replication.min( 1) - (dfs.replication 3) -
  • 101. 1. ( ) 2. 3. 4. ( )
  • 102. - fs.create(new Path("p")); - OutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush();
  • 103. - FSDataOutputStream sync() - sync() close() FSDataOutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush(); out.sync();
  • 104. - ‣ sync() ‣ sync() ‣ sync()
  • 105. distcp • 2 HDFS - hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar - hadoop distcp -overwrite hdfs://namenode1/foo hdfs://namenode2/bar/foo - hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo • MapReduce - 256MB (1GB 4 ) - map ( ) - map 1 (tasktracker) 20map
  • 106. Hadoop • • HAR • hadoop archive -archiveName files.har /my/files /my
  • 107. Hadoop • - ( ) - - HAR MapReduce ( 7.2.1.4 CombineFileInputFormat )
  • 108. HDFS - - - - • distcp • HAR