• Like
Sysprog 16
Upcoming SlideShare
Loading in...5
×

Sysprog 16

  • 2,145 views
Uploaded on

 

More in: Technology
  • Full Name Full Name Comment goes here.
    Are you sure you want to
    Your message goes here
    Be the first to comment
    Be the first to like this
No Downloads

Views

Total Views
2,145
On Slideshare
0
From Embeds
0
Number of Embeds
1

Actions

Shares
Downloads
42
Comments
0
Likes
0

Embeds 0

No embeds

Report content

Flagged as inappropriate Flag as inappropriate
Flag as inappropriate

Select your reason for flagging this presentation as inappropriate.

Cancel
    No notes for slide

Transcript

  • 1. C/C++ Linux System Programming
      • Session 16
      • User-space System Programming
      • – session 6
  • 2. Outline
    • Filesystem concepts
    • File I/O Ops
  • 3. Filesystem
    • Traditionally: An abstraction for storage device access
    • Why?
      • Common sensible organization
      • Encapsulate OS – HW interaction, e.g. performance considerations
  • 4. VFS
    • Wider-range abstraction:
      • special FS, different types of disk FS, network FS
      • Common user interface
      • Multiple FS's
      • Common handling
  • 5. Mounts
    • Superblocks – filesystem control block
    • Mount point
    • Syscalls
      • int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
      • int umount(const char *target);
  • 6. FS Objects and Metadata
    • Inode – file control block
      • A unique ID
      • Access/Owner info
      • Memory maps
      • Block device info
    • Dirent – file as a directory entry (not physical)
    • File – file data and hook to meta (not physical)
  • 7. Journaling
    • Problem:
      • operations on metadata are non-atomic, can be interrupted by power loss
    • Physical vs logical journals
    • Metadata-only journals
  • 8. Disk Cache
    • Buffers
    • Page cache
    • Writeback – pdflush
    • Read-ahead
  • 9. File Descriptors
    • Descriptors – index into process file table
    • int open(const char *pathname, int flags);
    • int open(const char *pathname, int flags, mode_t mode);
    • int creat(const char *pathname, mode_t mode);
      • Open with O_CREAT (disk files only)
    • int close(int fd); /* notice status !! */
  • 10. File I/O modes
    • int fcntl(int fd, int cmd, long arg); // F_SETFL
    • Nonblocking: If not ready, EAGAIN - O_NONBLOCK
    • Synchronized: Wait until data is on HW - O_SYNC
      • int fsync(int fd);
    • Asynchronous: Signal when ready - O_ASYNC
      • SIGIO handler
      • fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal)
    • Direct: Directly from user buffer - O_DIRECT
  • 11. More File control
    • int unlink(const char *pathname);
    • int truncate(const char *path, off_t length);
      • int ftruncate(int fd, off_t length);
      • O_TRUNC on open
  • 12. Descriptor I/O
    • ssize_t read(int fd, void *buf, size_t count);
    • ssize_t write(int fd, const void *buf, size_t count);
    • off_t lseek(int fd, off_t offset, int whence);
      • SEEK_SET, SEEK_CUR, SEEK_END
    • EOF
  • 13. IO Vectors
    • ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
    • ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
    struct iovec { void *iov_base; /* Starting address */ size_t iov_len; /* Number of bytes to transfer */ };
  • 14. int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* 'c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; }
  • 15. Memory Mapped file
    • void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
    • int munmap(void *start, size_t length);
    • Important flags:
      • No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK)
    • int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC
    • void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
  • 16. Locking
    • Mandatory Locking (BSD)
      • ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK)
      • Racy (mmap)
    • Advisory Locking
      • Both sides play nice
  • 17. Advisory Locking
    • int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN
    • int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST
    • fcntl: F_GETLK, F_SETLK, F_SETLKW
      • High level of control (with offset, down to a single byte)
    struct flock { ... short l_type; /* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence; /* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start; /* Starting offset for lock */ off_t l_len; /* Number of bytes to lock */ pid_t l_pid; /* PID of process blocking our lock (F_GETLK only) */ ... };
  • 18. #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif }
  • 19. Buffered I/O
    • Streams: Buffer I/O and write to kernel at once
      • Better alignment
      • Fewer system calls
      • Yet another “cache”!!
      • FILE *
      • Formatting
    • FILE *fopen(const char *path, const char *mode);
    • FILE *fdopen(int fd, const char *mode);
    • int fclose(FILE *fp);
    • int fileno(FILE *stream);
  • 20. I/O
    • size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
    • size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
    • Formatted
      • int fprintf(FILE *stream, const char *format, ...);
      • int fscanf(FILE *stream, const char *format, ...);
    • Char
      • int fputc(int c, FILE *stream);
      • int fgetc(FILE *stream); -- int ungetc(int c, FILE *stream);
    • String
      • int fputs(const char *s, FILE *stream);
      • char *fgets(char *s, int size, FILE *stream);
  • 21. Behind the Scenes
    • Inherently thread-safe
    • To do your own locking (of the stream, not the file)
      • void flockfile(FILE *filehandle);
      • int ftrylockfile(FILE *filehandle);
      • void funlockfile(FILE *filehandle);
      • xxx_unlocked versions (e.g. fread_unlocked)
    • Flushing the stream (not the page cache)
      • int fflush(FILE *stream);
  • 22. Errors
    • int feof(FILE *stream);
    • int ferror(FILE *stream);
    • void clearerr(FILE *stream);
    • Descriptor ops can not distinguish EOF vs error
  • 23. Positioning
    • int fseek(FILE *stream, long offset, int whence);
    • long ftell(FILE *stream);
    • int fgetpos(FILE *stream, fpos_t *pos);
    • int fsetpos(FILE *stream, fpos_t *pos);
  • 24. Metadata
    • int fstat(int fd, struct stat *buf);
    • int stat(const char *path, struct stat *buf);
      • lstat : BSD only
      • Exec on all nodes in path
    struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ };
  • 25. Directory Streams
    • A directory is a file whose entries are other inodes
    • DIR *opendir(const char *name);
    • int closedir(DIR *dir);
    • struct dirent *readdir(DIR *dir);
    struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ };
  • 26. static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; }
  • 27. I/O Multiplexing
    • int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
    • int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
    • int poll(struct pollfd *fds, nfds_t nfds, int timeout);
    • int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask);
      • POLLIN/POLLOUT/POLLPRI/POLLERR
    void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ };
  • 28. Epoll
    • Decouple interest set registration from poll
      • +: O(1) on the wait
      • +: Edge trigger
      • - : system call for adding onto the set
    • int epoll_create(int size); //desc, need close
    • int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    • int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
    typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
  • 29.  
  • 30. IOCTL
    • Device / special file control
    • int ioctl(int d, int request, ...);
    • Request is specific to device being controlled, and may have a payload (ioctl_list)
  • 31. Filesystem events
    • int inotify_init(void); // desc, need close
    • int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc
    • int inotify_rm_watch(int fd, uint32_t wd);
    • FIONREAD ioctl
    • fcntl: F_NOTIFY
    struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ };
  • 32. int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writtable file was closed "0" // 0x00000010 Unwrittable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B.
events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
  • 33. Asynchronous I/O
    • Only on O_DIRECT
    struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);