Hadoop
3    Hadoop
Hadoop

•
    -
    -
    -

•   HDFS(Hadoop Distributed Filesystem)
HDFS

•
    -
        ‣   MB, GB, TB

    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -
        ‣
    -
        ‣
    -
        ‣
        ‣
HDFS

•
    -        64MB




    -
        ‣
        ‣
        ‣
HDFS

•
    -   /

    -       (   )

    -       (   )
HDFS

•
    -
               (           )

    -              (
                       )
HDFS

•
    -
    -

    -
HDFS

•

    -          (   ,   )

    -
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -
 open()
append()
 write()
           NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
HDFS

•
    -


        NameNode   SecondaryNameNode
•   hadoop fs -copyFromLocal <localsrc> ... <dst>

•   hadoop fs -copyToLocal <src> <localdst>

•   hadoop fs -ls <path>

...
Hadoop

                        •hadoop fs -ls file:///
                        •hadoop fs -ls hdfs:///
                   ...
•   Thrift

•   C
    -   libhdfs




•   FUSE(FileSystem in Userspace)

•   WebDAV

•
    -   HTTP, FTP(           )
Java

    •   Hadoop URL

public class URLCat {
	 static {
	 	 URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactor...
Java

•   FileSystem API

    public class FileSystemCat {
    	 public static void main(String[] args) throws Exception {...
Java

•       FSDataInputStream
    public class FSDataInputStream extends DataInputStream
        implements Seekable, Po...
Java

  •   FSDataInputStream

public class FileSystemDoubleCat {
	 public static void main(String[] args) throws Exceptio...
Java

•       FSDataInputStream
    public class FSDataInputStream extends DataInputStream
        implements Seekable, Po...
Java

•
    -   public FSDataOutputStream create(Path f)
        throws IOException

    -   public FSDataOutputStream app...
Java

•   FSDataOutputStream
    -   FileSystem   create(), append()

    -

        public class FSDataOutputStream exten...
Java

•
    -   public boolean mkdirs(Path f) throws IOException
Java

  •
FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge"));

status.isDir(); //
status.getLen()...
Java

•
    -   public FileStatus[] listStatus(Path f) throws IOException;

    -   public FileStatus[] listStatus(Path f,...
Java

•
    public class ListStatus {
    	 public static void main(String[] args) throws Exception {
    	 	 String uri =...
Java

•
    -   public FileStatus[] globStatus(Path pathPattern) throws IOException

    -   public FileStatus[] globStatu...
Java

•


    [ab]                        {a,b}


    [^ab]                       {a,b}

                            {a,b}...
Java

•
    public interface PathFilter {
        boolean accept(Path path);
    }
Java

   •
         public class RegexExcludePathFilter implements PathFilter {

         	   private final String regex;
...
Java

•
    -   public boolean delete(Path f, boolean recursive)
        throws IOException;
•
HDFS       DistributedFileSystem                              NameNode




            FSDataInputStream




           ...
•
           open(new Path(“/aaa.txt”))
HDFS                                    DistributedFileSystem                     ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
                                                                “/aaa.txt”
           open(new Path(“/aaa.txt”))        ...
•
    -
    -
    -
•
    -
    -                  9.1.1

    -
                                    (/d1/r1/n1, /d1/r1/n1) = 0
               ...
•
HDFS       DistributedFileSystem                           NameNode




           FSDataOutputStream




              ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
           create(new Path(“/aaa.txt”))

HDFS                                      DistributedFileSystem                ...
•
    -                      dfs.replication.min(        1)



    -   (dfs.replication                          3)




  ...
•
    1.       (       )

    2.

    3.

    4.   (       )
•
    -

             fs.create(new Path("p"));




    -

        OutputStream out = fs.create(new Path("p"));
        ou...
•
    -   FSDataOutputStream sync()

    -   sync()   close()


                 FSDataOutputStream out = fs.create(new Pa...
•
    -
        ‣   sync()

        ‣            sync()

        ‣   sync()
distcp

•   2        HDFS

    -   hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar

    -   hadoop distcp -overwri...
Hadoop

•

•   HAR


•   hadoop archive -archiveName files.har /my/files /my
Hadoop

•
    -
                                             (
                )

    -
    -   HAR            MapReduce
 ...
•   HDFS
    -
    -
    -
    -


•   distcp

•   HAR
•

•
Upcoming SlideShare
Loading in …5
×

第2回 Hadoop 輪読会

2,332 views

Published on

第2回 Hadoop 輪読会の発表資料

Published in: Technology
0 Comments
3 Likes
Statistics
Notes
  • Be the first to comment

No Downloads
Views
Total views
2,332
On SlideShare
0
From Embeds
0
Number of Embeds
486
Actions
Shares
0
Downloads
58
Comments
0
Likes
3
Embeds 0
No embeds

No notes for slide

第2回 Hadoop 輪読会

  1. 1. Hadoop 3 Hadoop
  2. 2. Hadoop • - - - • HDFS(Hadoop Distributed Filesystem)
  3. 3. HDFS • - ‣ MB, GB, TB - ‣ - ‣ ‣
  4. 4. HDFS • - ‣ - ‣ - ‣ ‣
  5. 5. HDFS • - 64MB - ‣ ‣ ‣
  6. 6. HDFS • - / - ( ) - ( )
  7. 7. HDFS • - ( ) - ( )
  8. 8. HDFS • - - -
  9. 9. HDFS • - ( , ) -
  10. 10. HDFS • - NameNode SecondaryNameNode
  11. 11. HDFS • - open() append() write() NameNode SecondaryNameNode
  12. 12. HDFS • - open() append() write() NameNode SecondaryNameNode
  13. 13. HDFS • - open() append() write() NameNode SecondaryNameNode
  14. 14. HDFS • - NameNode SecondaryNameNode
  15. 15. HDFS • - NameNode SecondaryNameNode
  16. 16. HDFS • - NameNode SecondaryNameNode
  17. 17. HDFS • - NameNode SecondaryNameNode
  18. 18. • hadoop fs -copyFromLocal <localsrc> ... <dst> • hadoop fs -copyToLocal <src> <localdst> • hadoop fs -ls <path> • hadoop fs -mkdir <path> • hadoop fs -help
  19. 19. Hadoop •hadoop fs -ls file:/// •hadoop fs -ls hdfs:/// •hadoop fs -ls hftp:/// URI java local file org.apache.hadoop.fs.LocalFileSystem HDFS hdfs org.apache.hadoop.hdfs.DistributedFileSystem HFTP hftp org.apache.hadoop.hdfs.HftpFileSystem HSFTP hsftp org.apache.hadoop.hdfs.HsftpFileSystem HAR har org.apache.hadoop.fs.HarFileSystem KFS kfs org.apache.hadoop.fs.kfs.KosmosFileSystem FTP ftp org.apache.hadoop.fs.ftp.FTPFileSystem S3 s3n org.apache.hadoop.fs.s3native.NativeS3FileSystem ( ) S3 s3 org.apache.hadoop.fs.s3.S3FileSystem ( )
  20. 20. • Thrift • C - libhdfs • FUSE(FileSystem in Userspace) • WebDAV • - HTTP, FTP( )
  21. 21. Java • Hadoop URL public class URLCat { static { URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()); } public static void main(String[] args) throws Exception { InputStream in = null; try { in = new URL(args[0]).openStream(); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  22. 22. Java • FileSystem API public class FileSystemCat { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); InputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  23. 23. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface Seekable { void seek(long pos) throws IOException; long getPos() throws IOException; boolean seekToNewSource(long targetPos) throws IOException; }
  24. 24. Java • FSDataInputStream public class FileSystemDoubleCat { public static void main(String[] args) throws Exception { String uri = args[0]; FileSystem fs = FileSystem.get(URI.create(uri), new Configuration()); FSDataInputStream in = null; try { in = fs.open(new Path(uri)); IOUtils.copyBytes(in, System.out, 4096, false); in.seek(0); IOUtils.copyBytes(in, System.out, 4096, false); } finally { IOUtils.closeStream(in); } } }
  25. 25. Java • FSDataInputStream public class FSDataInputStream extends DataInputStream implements Seekable, PositionedReadable { // } public interface PositionedReadable { int read(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[], int offset, int length) throws IOException; void readFully(long position, byte buffer[]) throws IOException; }
  26. 26. Java • - public FSDataOutputStream create(Path f) throws IOException - public FSDataOutputStream append(Path f) throws IOException
  27. 27. Java • FSDataOutputStream - FileSystem create(), append() - public class FSDataOutputStream extends DataOutputStream implements Syncable { public long getPos() throws IOException { // } // }
  28. 28. Java • - public boolean mkdirs(Path f) throws IOException
  29. 29. Java • FileStatus status = fs.getFileStatus(new Path("hdfs://localhost/hogehoge")); status.isDir(); // status.getLen(); // status.getModificationTime(); // status.getReplication(); // status.getBlockSize(); // ( 64MB) status.getOwner(); // status.getGroup(); // status.getPermission().toString(); //
  30. 30. Java • - public FileStatus[] listStatus(Path f) throws IOException; - public FileStatus[] listStatus(Path f, PathFilter filter) throws IOException; - public FileStatus[] listStatus(Path[] files) throws IOException; - public FileStatus[] listStatus(Path[] files, PathFilter filter) throws IOException;
  31. 31. Java • public class ListStatus { public static void main(String[] args) throws Exception { String uri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(uri), conf); Path[] paths = new Path[args.length]; for (int i = 0; i < paths.length; i++) { paths[i] = new Path(args[i]); } FileStatus[] status = fs.listStatus(paths); for (FileStatus stat : status) { System.out.println(stat.getPath().toUri().getPath()); } } }
  32. 32. Java • - public FileStatus[] globStatus(Path pathPattern) throws IOException - public FileStatus[] globStatus(Path pathPattern, PathFilter filter) throws IOException
  33. 33. Java • [ab] {a,b} [^ab] {a,b} {a,b} (a b ) [a-b] a b {a,b} (a b ) a b [^a-b] {a,b} a b \c c c
  34. 34. Java • public interface PathFilter { boolean accept(Path path); }
  35. 35. Java • public class RegexExcludePathFilter implements PathFilter { private final String regex; public RegexExcludePathFilter(String regex) { this.regex = regex; } @Override public boolean accept(Path path) { return !path.toString().matches(regex); } } fs.globStatus(new Path("/2007/*/*"), new RegexExcludePathFilter("^.*/2007/12/31$"));
  36. 36. Java • - public boolean delete(Path f, boolean recursive) throws IOException;
  37. 37. • HDFS DistributedFileSystem NameNode FSDataInputStream DataNode1 DataNode2 DataNode3 DataNode4 block1 block3 block1 block2 block4 block4 block2 block3
  38. 38. • open(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  39. 39. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode FSDataInputStream DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  40. 40. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode aaa.txt : block1, block2, block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DataNode1 DataNode2 DataNode3 DataNode4 block1 block3 block1 block2 block4 block4 block2 block3
  41. 41. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  42. 42. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  43. 43. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  44. 44. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  45. 45. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  46. 46. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  47. 47. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  48. 48. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  49. 49. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 close() block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  50. 50. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  51. 51. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  52. 52. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  53. 53. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  54. 54. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  55. 55. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  56. 56. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  57. 57. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  58. 58. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  59. 59. • “/aaa.txt” open(new Path(“/aaa.txt”)) ( ) HDFS DistributedFileSystem NameNode block1 : DataNode1, DataNode3 read() block2 : DataNode3, DataNode4 aaa.txt : block1, block2. block3, block4 block1 : DataNode1, DataNode3 block2 : DataNode3, DataNode4 FSDataInputStream block3 : DataNode2, DataNode3 block4 : DataNode1, DataNode2 DateNode1 DateNode2 DateNode3 DateNode4 block1 block3 block1 block2 block4 block4 block2 block3
  60. 60. • - - -
  61. 61. • - - 9.1.1 - (/d1/r1/n1, /d1/r1/n1) = 0 d1 d2 (/d1/r1/n1, /d1/r1/n2) = 2 (/d1/r1/n1, /d1/r2/n3) = 4 r1 r2 r3 (/d1/r1/n1, /d2/r3/n4) = 6 n1 n2 n3 n4
  62. 62. • HDFS DistributedFileSystem NameNode FSDataOutputStream DataNode1 DataNode2 DataNode3
  63. 63. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  64. 64. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  65. 65. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode FSDataOutputStream DateNode1 DateNode2 DateNode3
  66. 66. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DateNode1 DateNode2 DateNode3
  67. 67. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  68. 68. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  69. 69. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream ack DateNode1 DateNode2 DateNode3
  70. 70. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  71. 71. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  72. 72. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  73. 73. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3
  74. 74. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  75. 75. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  76. 76. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  77. 77. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  78. 78. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  79. 79. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  80. 80. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  81. 81. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  82. 82. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  83. 83. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() close() FSDataOutputStream block1 : DataNode1, DataNode2, DataNode3 DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1 block1
  84. 84. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3
  85. 85. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1
  86. 86. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  87. 87. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  88. 88. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block1 block1
  89. 89. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  90. 90. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  91. 91. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1
  92. 92. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  93. 93. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  94. 94. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  95. 95. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  96. 96. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block1 block2
  97. 97. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  98. 98. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2
  99. 99. • create(new Path(“/aaa.txt”)) HDFS DistributedFileSystem NameNode write() FSDataOutputStream DataStreamer ack DateNode1 DateNode2 DateNode3 block2 block1 block2 block2
  100. 100. • - dfs.replication.min( 1) - (dfs.replication 3) -
  101. 101. • 1. ( ) 2. 3. 4. ( )
  102. 102. • - fs.create(new Path("p")); - OutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush();
  103. 103. • - FSDataOutputStream sync() - sync() close() FSDataOutputStream out = fs.create(new Path("p")); out.write("content".getBytes("UTF-8")); out.flush(); out.sync();
  104. 104. • - ‣ sync() ‣ sync() ‣ sync()
  105. 105. distcp • 2 HDFS - hadoop distcp hdfs://namenode1/foo hdfs://namenode2/bar - hadoop distcp -overwrite hdfs://namenode1/foo hdfs://namenode2/bar/foo - hadoop distcp -update hdfs://namenode1/foo hdfs://namenode2/bar/foo • MapReduce - 256MB (1GB 4 ) - map ( ) - map 1 (tasktracker) 20map
  106. 106. Hadoop • • HAR • hadoop archive -archiveName files.har /my/files /my
  107. 107. Hadoop • - ( ) - - HAR MapReduce ( 7.2.1.4 CombineFileInputFormat )
  108. 108. • HDFS - - - - • distcp • HAR
  109. 109. • •

×