C/C++ Linux System Programming Session 16 User-space System Programming  – session 6
Outline Filesystem concepts File I/O Ops
Filesystem Traditionally: An abstraction for storage device access Why? Common sensible organization Encapsulate OS – HW interaction, e.g. performance considerations
VFS  Wider-range abstraction:  special FS, different types of disk FS, network FS Common user interface Multiple FS's Common handling
Mounts Superblocks – filesystem control block Mount point Syscalls int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data); int umount(const char *target);
FS Objects and Metadata Inode – file control block A unique ID Access/Owner info Memory maps Block device info Dirent – file as a directory entry (not physical) File – file data and hook to meta (not physical)
Journaling Problem: operations on metadata are non-atomic, can be interrupted by power loss Physical vs logical journals Metadata-only journals
Disk Cache Buffers Page cache Writeback – pdflush Read-ahead
File Descriptors Descriptors – index into process file table int open(const char *pathname, int flags); int open(const char *pathname, int flags, mode_t mode); int creat(const char *pathname, mode_t mode);  Open with O_CREAT (disk files only) int close(int fd); /* notice status !! */
File I/O modes int fcntl(int fd, int cmd, long arg); // F_SETFL Nonblocking: If not ready, EAGAIN - O_NONBLOCK  Synchronized: Wait until data is on HW - O_SYNC int fsync(int fd); Asynchronous: Signal when ready - O_ASYNC SIGIO handler  fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal) Direct: Directly from user buffer - O_DIRECT
More File control int unlink(const char *pathname); int truncate(const char *path, off_t length); int ftruncate(int fd, off_t length); O_TRUNC on open
Descriptor I/O ssize_t read(int fd, void *buf, size_t count); ssize_t write(int fd, const void *buf, size_t count); off_t lseek(int fd, off_t offset, int whence); SEEK_SET, SEEK_CUR, SEEK_END EOF
IO Vectors ssize_t readv(int fd, const struct iovec *iov, int iovcnt); ssize_t writev(int fd, const struct iovec *iov, int iovcnt); struct iovec { void  *iov_base;  /* Starting address */ size_t iov_len;  /* Number of bytes to transfer */ };
int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* '\c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; }
Memory Mapped file void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); int munmap(void *start, size_t length); Important flags: No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK) int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
Locking Mandatory Locking (BSD) ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK) Racy (mmap) Advisory Locking Both sides play nice
Advisory Locking int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST fcntl: F_GETLK, F_SETLK, F_SETLKW High level of control (with offset, down to a single byte) struct flock { ... short l_type;  /* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence;  /* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start;  /* Starting offset for lock */ off_t l_len;  /* Number of bytes to lock */ pid_t l_pid;  /* PID of process blocking our lock (F_GETLK only) */ ... };
#ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif }
Buffered I/O Streams:  Buffer I/O and write to kernel at once Better alignment Less system calls Yet another “cache”!! FILE * Formatting  FILE *fopen(const char *path, const char *mode); FILE *fdopen(int fd, const char *mode); int fclose(FILE *fp); int fileno(FILE *stream);
I/O size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream); size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); Formatted int fprintf(FILE *stream, const char *format, ...); int fscanf(FILE *stream, const char *format, ...); Char int fputc(int c, FILE *stream); int fgetc(FILE *stream);  -- int ungetc(int c, FILE *stream); String int fputs(const char *s, FILE *stream); char *fgets(char *s, int size, FILE *stream);
Behind the Scenes Inherently thread-safe To do your own locking (of the stream, not the file) void flockfile(FILE *filehandle); int ftrylockfile(FILE *filehandle); void funlockfile(FILE *filehandle); xxx_unlocked versions (e.g. fread_unlocked) Flushing the stream (not the page cache)  int fflush(FILE *stream);
Errors int feof(FILE *stream); int ferror(FILE *stream); void clearerr(FILE *stream); Descriptor ops can not distinguish EOF vs error
Positioning int fseek(FILE *stream, long offset, int whence); long ftell(FILE *stream); int fgetpos(FILE *stream, fpos_t *pos); int fsetpos(FILE *stream, fpos_t *pos);
Metadata int fstat(int fd, struct stat *buf); int stat(const char *path, struct stat *buf); lstat : BSD only Exec on all nodes in path struct stat { dev_t  st_dev;  /* ID of device containing file */ ino_t  st_ino;  /* inode number */ mode_t  st_mode;  /* protection */ nlink_t  st_nlink;  /* number of hard links */ uid_t  st_uid;  /* user ID of owner */ gid_t  st_gid;  /* group ID of owner */ dev_t  st_rdev;  /* device ID (if special file) */ off_t  st_size;  /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t  st_blocks;  /* number of blocks allocated */ time_t  st_atime;  /* time of last access */ time_t  st_mtime;  /* time of last modification */ time_t  st_ctime;  /* time of last status change */ };
Directory Streams A directory is a file whose entries are other inodes DIR *opendir(const char *name); int closedir(DIR *dir); struct dirent *readdir(DIR *dir); struct dirent { ino_t  d_ino;  /* inode number */ off_t  d_off;  /* offset to the next dirent */ unsigned short d_reclen;  /* length of this record */ unsigned char  d_type;  /* type of file */ char  d_name[256]; /* filename */ };
static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; }
I/O Multiplexing int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout); int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask); int poll(struct pollfd *fds, nfds_t nfds, int timeout); int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask); POLLIN/POLLOUT/POLLPRI/POLLERR void FD_CLR(int fd, fd_set *set); int  FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int  fd;  /* file descriptor */ short events;  /* requested events */ short revents;  /* returned events */ };
Epoll  Decouple interest set registration from poll +: O(1) on the wait +: Edge trigger - : system call for adding onto the set int epoll_create(int size); //desc, need close int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout); typedef union epoll_data { void  *ptr; int  fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t  events;  /* Epoll events */ epoll_data_t data;  /* User data variable */ };
 
IOCTL Device / special file control int ioctl(int d, int request, ...); Request is specific to device being controlled, and may have a payload (ioctl_list)
Filesystem events int inotify_init(void); // desc, need close int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc int inotify_rm_watch(int fd, uint32_t wd); FIONREAD ioctl fcntl: F_NOTIFY struct inotify_event { int wd;  /* watch descriptor */ uint32_t mask;  /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len;  /* size of 'name' field */ char name[];  /* null-terminated name */ };
int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names;   if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writable file was closed "0" // 0x00000010 Unwritable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B. 
events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
Asynchronous I/O Only on O_DIRECT struct aiocb { int aio_fildes;  /* file descriptor */ int aio_lio_opcode;  /* operation to perform */ int aio_reqprio;  /* request priority offset */ volatile void *aio_buf;  /* pointer to buffer */ size_t aio_nbytes;  /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);

Sysprog 16

  • 1.
    C/C++ Linux System Programming Session 16 User-space System Programming – session 6
  • 2.
  • 3.
    Filesystem Traditionally: An abstraction for storage device access Why? Common sensible organization Encapsulate OS – HW interaction, e.g. performance considerations
  • 4.
    VFS Wider-range abstraction: special FS, different types of disk FS, network FS Common user interface Multiple FS's Common handling
  • 5.
    Mounts Superblocks – filesystem control block Mount point Syscalls int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data); int umount(const char *target);
  • 6.
    FS Objects and Metadata Inode – file control block A unique ID Access/Owner info Memory maps Block device info Dirent – file as a directory entry (not physical) File – file data and hook to meta (not physical)
  • 7.
    Journaling Problem: operations on metadata are non-atomic, can be interrupted by power loss Physical vs logical journals Metadata-only journals
  • 8.
    Disk Cache Buffers Page cache Writeback – pdflush Read-ahead
  • 9.
    File Descriptors Descriptors – index into process file table int open(const char *pathname, int flags); int open(const char *pathname, int flags, mode_t mode); int creat(const char *pathname, mode_t mode); Open with O_CREAT (disk files only) int close(int fd); /* notice status !! */
  • 10.
    File I/O modes int fcntl(int fd, int cmd, long arg); // F_SETFL Nonblocking: If not ready, EAGAIN - O_NONBLOCK Synchronized: Wait until data is on HW - O_SYNC int fsync(int fd); Asynchronous: Signal when ready - O_ASYNC SIGIO handler fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal) Direct: Directly from user buffer - O_DIRECT
  • 11.
    More File control int unlink(const char *pathname); int truncate(const char *path, off_t length); int ftruncate(int fd, off_t length); O_TRUNC on open
  • 12.
    Descriptor I/O ssize_t read(int fd, void *buf, size_t count); ssize_t write(int fd, const void *buf, size_t count); off_t lseek(int fd, off_t offset, int whence); SEEK_SET, SEEK_CUR, SEEK_END EOF
  • 13.
    IO Vectors ssize_t readv(int fd, const struct iovec *iov, int iovcnt); ssize_t writev(int fd, const struct iovec *iov, int iovcnt); struct iovec { void *iov_base; /* Starting address */ size_t iov_len; /* Number of bytes to transfer */ };
  • 14.
    int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* '\c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; }
  • 15.
    Memory Mapped file void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); int munmap(void *start, size_t length); Important flags: No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK) int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);
  • 16.
    Locking Mandatory Locking (BSD) ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK) Racy (mmap) Advisory Locking Both sides play nice
  • 17.
    Advisory Locking int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST fcntl: F_GETLK, F_SETLK, F_SETLKW High level of control (with offset, down to a single byte) struct flock { ... short l_type; /* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence; /* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start; /* Starting offset for lock */ off_t l_len; /* Number of bytes to lock */ pid_t l_pid; /* PID of process blocking our lock (F_GETLK only) */ ... };
  • 18.
    #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif }
  • 19.
    Buffered I/O Streams: Buffer I/O and write to kernel at once Better alignment Less system calls Yet another “cache”!! FILE * Formatting FILE *fopen(const char *path, const char *mode); FILE *fdopen(int fd, const char *mode); int fclose(FILE *fp); int fileno(FILE *stream);
  • 20.
    I/O size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream); size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); Formatted int fprintf(FILE *stream, const char *format, ...); int fscanf(FILE *stream, const char *format, ...); Char int fputc(int c, FILE *stream); int fgetc(FILE *stream); -- int ungetc(int c, FILE *stream); String int fputs(const char *s, FILE *stream); char *fgets(char *s, int size, FILE *stream);
  • 21.
    Behind the Scenes Inherently thread-safe To do your own locking (of the stream, not the file) void flockfile(FILE *filehandle); int ftrylockfile(FILE *filehandle); void funlockfile(FILE *filehandle); xxx_unlocked versions (e.g. fread_unlocked) Flushing the stream (not the page cache) int fflush(FILE *stream);
  • 22.
    Errors int feof(FILE *stream); int ferror(FILE *stream); void clearerr(FILE *stream); Descriptor ops can not distinguish EOF vs error
  • 23.
    Positioning int fseek(FILE *stream, long offset, int whence); long ftell(FILE *stream); int fgetpos(FILE *stream, fpos_t *pos); int fsetpos(FILE *stream, fpos_t *pos);
  • 24.
    Metadata int fstat(int fd, struct stat *buf); int stat(const char *path, struct stat *buf); lstat : BSD only Exec on all nodes in path struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ };
  • 25.
    Directory Streams A directory is a file whose entries are other inodes DIR *opendir(const char *name); int closedir(DIR *dir); struct dirent *readdir(DIR *dir); struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ };
  • 26.
    static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; }
  • 27.
    I/O Multiplexing int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout); int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask); int poll(struct pollfd *fds, nfds_t nfds, int timeout); int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask); POLLIN/POLLOUT/POLLPRI/POLLERR void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ };
  • 28.
    Epoll Decouple interest set registration from poll +: O(1) on the wait +: Edge trigger - : system call for adding onto the set int epoll_create(int size); //desc, need close int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout); typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
  • 29.
  • 30.
    IOCTL Device / special file control int ioctl(int d, int request, ...); Request is specific to device being controlled, and may have a payload (ioctl_list)
  • 31.
    Filesystem events int inotify_init(void); // desc, need close int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc int inotify_rm_watch(int fd, uint32_t wd); FIONREAD ioctl fcntl: F_NOTIFY struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ };
  • 32.
    int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writable file was closed "0" // 0x00000010 Unwritable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B. 
events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
  • 33.
    Asynchronous I/O Only on O_DIRECT struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);