Your SlideShare is downloading. ×
Sysprog 16
Upcoming SlideShare
Loading in...5
×

Thanks for flagging this SlideShare!

Oops! An error has occurred.

×

Saving this for later?

Get the SlideShare app to save on your phone or tablet. Read anywhere, anytime - even offline.

Text the download link to your phone

Standard text messaging rates apply

Sysprog 16

2,162
views

Published on

Published in: Technology

0 Comments
0 Likes
Statistics
Notes
  • Be the first to comment

  • Be the first to like this

No Downloads
Views
Total Views
2,162
On Slideshare
0
From Embeds
0
Number of Embeds
1
Actions
Shares
0
Downloads
42
Comments
0
Likes
0
Embeds 0
No embeds

Report content
Flagged as inappropriate Flag as inappropriate
Flag as inappropriate

Select your reason for flagging this presentation as inappropriate.

Cancel
No notes for slide

Transcript

  • 1. C/C++ Linux System Programming
      • Session 16
      • User-space System Programming
      • – session 6
  • 2. Outline
    • Filesystem concepts
    • File I/O Ops
  • 3. Filesystem
    • Traditionally: An abstraction for storage device access
    • Why?
      • Common sensible organization
      • Encapsulate OS – HW interaction, e.g. performance considerations
  • 4. VFS
    • Wider-range abstraction:
      • special FS, different types of disk FS, network FS
      • Common user interface
      • Multiple FS's
      • Common handling
  • 5. Mounts
    • Superblocks – filesystem control block
    • Mount point
    • Syscalls
      • int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
      • int umount(const char *target);
  • 6. FS Objects and Metadata
    • Inode – file control block
      • A unique ID
      • Access/Owner info
      • Memory maps
      • Block device info
    • Dirent – file as a directory entry (not physical)
    • File – file data and hook to meta (not physical)
  • 7. Journaling
    • Problem:
      • operations on metadata are non-atomic, can be interrupted by power loss
    • Physical vs logical journals
    • Metadata-only journals
  • 8. Disk Cache
    • Buffers
    • Page cache
    • Writeback – pdflush
    • Read-ahead
  • 9. File Descriptors
    • Descriptors – index into process file table
    • int open(const char *pathname, int flags);
    • int open(const char *pathname, int flags, mode_t mode);
    • int creat(const char *pathname, mode_t mode);
      • Open with O_CREAT (disk files only)
    • int close(int fd); /* notice status !! */
  • 10. File I/O modes
    • int fcntl(int fd, int cmd, long arg); // F_SETFL
    • Nonblocking: If not ready, EAGAIN - O_NONBLOCK
    • Synchronized: Wait until data is on HW - O_SYNC
      • int fsync(int fd);
    • Asynchronous: Signal when ready - O_ASYNC
      • SIGIO handler
      • fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal)
    • Direct: Directly from user buffer - O_DIRECT
  • 11. More File control
    • int unlink(const char *pathname);
    • int truncate(const char *path, off_t length);
      • int ftruncate(int fd, off_t length);
      • O_TRUNC on open
  • 12. Descriptor I/O
    • ssize_t read(int fd, void *buf, size_t count);
    • ssize_t write(int fd, const void *buf, size_t count);
    • off_t lseek(int fd, off_t offset, int whence);
      • SEEK_SET, SEEK_CUR, SEEK_END
    • EOF
  • 13. IO Vectors
    • ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
    • ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
    struct iovec { void *iov_base; /* Starting address */ size_t iov_len; /* Number of bytes to transfer */ };
  • 14. int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* 'c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; }
  • 15. Memory Mapped file
    • void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
    • int munmap(void *start, size_t length);
    • Important flags:
      • No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK)
    • int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC
    • void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
  • 16. Locking
    • Mandatory Locking (BSD)
      • ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK)
      • Racy (mmap)
    • Advisory Locking
      • Both sides play nice
  • 17. Advisory Locking
    • int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN
    • int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST
    • fcntl: F_GETLK, F_SETLK, F_SETLKW
      • High level of control (with offset, down to a single byte)
    struct flock { ... short l_type; /* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence; /* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start; /* Starting offset for lock */ off_t l_len; /* Number of bytes to lock */ pid_t l_pid; /* PID of process blocking our lock (F_GETLK only) */ ... };
  • 18. #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif }
  • 19. Buffered I/O
    • Streams: Buffer I/O and write to kernel at once
      • Better alignment
      • Less system calls
      • Yet another “cache”!!
      • FILE *
      • Formatting
    • FILE *fopen(const char *path, const char *mode);
    • FILE *fdopen(int fd, const char *mode);
    • int fclose(FILE *fp);
    • int fileno(FILE *stream);
  • 20. I/O
    • size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
    • size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
    • Formatted
      • int fprintf(FILE *stream, const char *format, ...);
      • int fscanf(FILE *stream, const char *format, ...);
    • Char
      • int fputc(int c, FILE *stream);
      • int fgetc(FILE *stream); -- int ungetc(int c, FILE *stream);
    • String
      • int fputs(const char *s, FILE *stream);
      • char *fgets(char *s, int size, FILE *stream);
  • 21. Behind the Scenes
    • Inherently thread-safe
    • To do your own locking (of the stream, not the file)
      • void flockfile(FILE *filehandle);
      • int ftrylockfile(FILE *filehandle);
      • void funlockfile(FILE *filehandle);
      • xxx_unlocked versions (e.g. fread_unlocked)
    • Flushing the stream (not the page cache)
      • int fflush(FILE *stream);
  • 22. Errors
    • int feof(FILE *stream);
    • int ferror(FILE *stream);
    • void clearerr(FILE *stream);
    • Stream ops' return values alone can not distinguish EOF vs error – use feof()/ferror() (descriptor read() can: 0 vs -1)
  • 23. Positioning
    • int fseek(FILE *stream, long offset, int whence);
    • long ftell(FILE *stream);
    • int fgetpos(FILE *stream, fpos_t *pos);
    • int fsetpos(FILE *stream, const fpos_t *pos);
  • 24. Metadata
    • int fstat(int fd, struct stat *buf);
    • int stat(const char *path, struct stat *buf);
      • lstat : BSD only
      • Exec on all nodes in path
    struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ };
  • 25. Directory Streams
    • A directory is a file whose entries are other inodes
    • DIR *opendir(const char *name);
    • int closedir(DIR *dir);
    • struct dirent *readdir(DIR *dir);
    struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ };
  • 26. static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; }
  • 27. I/O Multiplexing
    • int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
    • int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
    • int poll(struct pollfd *fds, nfds_t nfds, int timeout);
    • int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask);
      • POLLIN/POLLOUT/POLLPRI/POLLERR
    void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ };
  • 28. Epoll
    • Decouple interest set registration from poll
      • +: O(1) on the wait
      • +: Edge trigger
      • - : system call for adding onto the set
    • int epoll_create(int size); //desc, need close
    • int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    • int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
    typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
  • 29.  
  • 30. IOCTL
    • Device / special file control
    • int ioctl(int d, int request, ...);
    • Request is specific to device being controlled, and may have a payload (ioctl_list)
  • 31. Filesystem events
    • int inotify_init(void); // desc, need close
    • int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc
    • int inotify_rm_watch(int fd, uint32_t wd);
    • FIONREAD ioctl
    • fcntl: F_NOTIFY
    struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ };
  • 32. int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writable file was closed "0" // 0x00000010 Unwritable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B. events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
  • 33. Asynchronous I/O
    • Only on O_DIRECT
    struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);