Sysprog 16
Upcoming SlideShare
Loading in...5
×
 

Sysprog 16

on

  • 2,579 views

 

Statistics

Views

Total Views
2,579
Views on SlideShare
2,579
Embed Views
0

Actions

Likes
0
Downloads
42
Comments
0

0 Embeds 0

No embeds

Accessibility

Categories

Upload Details

Uploaded via as OpenOffice

Usage Rights

© All Rights Reserved

Report content

Flagged as inappropriate Flag as inappropriate
Flag as inappropriate

Select your reason for flagging this presentation as inappropriate.

Cancel
  • Full Name Full Name Comment goes here.
    Are you sure you want to
    Your message goes here
    Processing…
Post Comment
Edit your comment

Sysprog 16 Sysprog 16 Presentation Transcript

  • C/C++ Linux System Programming
      • Session 16
      • User-space System Programming
      • – session 6
  • Outline
    • Filesystem concepts
    • File I/O Ops
  • Filesystem
    • Traditionally: An abstraction for storage device access
    • Why?
      • Common sensible organization
      • Encapsulate OS – HW interaction, e.g. performance considerations
  • VFS
    • Wider-range abstraction:
      • special FS, different types of disk FS, network FS
      • Common user interface
      • Multiple FS's
      • Common handling
  • Mounts
    • Superblocks – filesystem control block
    • Mount point
    • Syscalls
      • int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
      • int umount(const char *target);
  • FS Objects and Metadata
    • Inode – file control block
      • A unique ID
      • Access/Owner info
      • Memory maps
      • Block device info
    • Dirent – file as a directory entry (not physical)
    • File – file data and hook to meta (not physical)
  • Journaling
    • Problem:
      • operations on metadata are non-atomic, can be interrupted by power loss
    • Physical vs logical journals
    • Metadata-only journals
  • Disk Cache
    • Buffers
    • Page cache
    • Writeback – pdflush
    • Read-ahead
  • File Descriptors
    • Descriptors – index into process file table
    • int open(const char *pathname, int flags);
    • int open(const char *pathname, int flags, mode_t mode);
    • int creat(const char *pathname, mode_t mode);
      • Open with O_CREAT (disk files only)
    • int close(int fd); /* notice status !! */
  • File I/O modes
    • int fcntl(int fd, int cmd, long arg); // F_SETFL
    • Nonblocking: If not ready, EAGAIN - O_NONBLOCK
    • Synchronized: Wait until data is on HW - O_SYNC
      • int fsync(int fd);
    • Asynchronous: Signal when ready - O_ASYNC
      • SIGIO handler
      • fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal)
    • Direct: Directly from user buffer - O_DIRECT
  • More File control
    • int unlink(const char *pathname);
    • int truncate(const char *path, off_t length);
      • int ftruncate(int fd, off_t length);
      • O_TRUNC on open
  • Descriptor I/O
    • ssize_t read(int fd, void *buf, size_t count);
    • ssize_t write(int fd, const void *buf, size_t count);
    • off_t lseek(int fd, off_t offset, int whence);
      • SEEK_SET, SEEK_CUR, SEEK_END
    • EOF
  • IO Vectors
    • ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
    • ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
    struct iovec { void *iov_base; /* Starting address */ size_t iov_len; /* Number of bytes to transfer */ };
  • int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* 'c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; }
  • Memory Mapped file
    • void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
    • int munmap(void *start, size_t length);
    • Important flags:
      • No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK)
    • int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC
    • void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
  • Locking
    • Mandatory Locking (BSD)
      • ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK)
      • Racy (mmap)
    • Advisory Locking
      • Both sides play nice
  • Advisory Locking
    • int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN
    • int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST
    • fcntl: F_GETLK, F_SETLK, F_SETLKW
      • High level of control (with offset, down to a single byte)
    struct flock { ... short l_type; /* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence; /* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start; /* Starting offset for lock */ off_t l_len; /* Number of bytes to lock */ pid_t l_pid; /* PID of process blocking our lock (F_GETLK only) */ ... };
  • #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif }
  • Buffered I/O
    • Streams: Buffer I/O and write to kernel at once
      • Better alignment
      • Fewer system calls
      • Yet another “cache”!!
      • FILE *
      • Formatting
    • FILE *fopen(const char *path, const char *mode);
    • FILE *fdopen(int fd, const char *mode);
    • int fclose(FILE *fp);
    • int fileno(FILE *stream);
  • I/O
    • size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
    • size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
    • Formatted
      • int fprintf(FILE *stream, const char *format, ...);
      • int fscanf(FILE *stream, const char *format, ...);
    • Char
      • int fputc(int c, FILE *stream);
      • int fgetc(FILE *stream); -- int ungetc(int c, FILE *stream);
    • String
      • int fputs(const char *s, FILE *stream);
      • char *fgets(char *s, int size, FILE *stream);
  • Behind the Scenes
    • Inherently thread-safe
    • To do your own locking (of the stream, not the file)
      • void flockfile(FILE *filehandle);
      • int ftrylockfile(FILE *filehandle);
      • void funlockfile(FILE *filehandle);
      • xxx_unlocked versions (e.g. fread_unlocked)
    • Flushing the stream (not the page cache)
      • int fflush(FILE *stream);
  • Errors
    • int feof(FILE *stream);
    • int ferror(FILE *stream);
    • void clearerr(FILE *stream);
    • Descriptor ops can not distinguish EOF vs error
  • Positioning
    • int fseek(FILE *stream, long offset, int whence);
    • long ftell(FILE *stream);
    • int fgetpos(FILE *stream, fpos_t *pos);
    • int fsetpos(FILE *stream, const fpos_t *pos);
  • Metadata
    • int fstat(int fd, struct stat *buf);
    • int stat(const char *path, struct stat *buf);
      • lstat : BSD only
      • Requires execute (search) permission on every directory in the path
    struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ };
  • Directory Streams
    • A directory is a file whose entries are other inodes
    • DIR *opendir(const char *name);
    • int closedir(DIR *dir);
    • struct dirent *readdir(DIR *dir);
    struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ };
  • static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; }
  • I/O Multiplexing
    • int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
    • int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
    • int poll(struct pollfd *fds, nfds_t nfds, int timeout);
    • int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask);
      • POLLIN/POLLOUT/POLLPRI/POLLERR
    void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ };
  • Epoll
    • Decouple interest set registration from poll
      • +: O(1) on the wait
      • +: Edge trigger
      • - : system call for adding onto the set
    • int epoll_create(int size); //desc, need close
    • int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
    • int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
    typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
  •  
  • IOCTL
    • Device / special file control
    • int ioctl(int d, int request, ...);
    • Request is specific to device being controlled, and may have a payload (ioctl_list)
  • Filesystem events
    • int inotify_init(void); // desc, need close
    • int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc
    • int inotify_rm_watch(int fd, uint32_t wd);
    • FIONREAD ioctl
    • fcntl: F_NOTIFY
    struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ };
  • int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writable file was closed "0" // 0x00000010 Unwritable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B. 
events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
  • Asynchronous I/O
    • Only on O_DIRECT
    struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);