In the blog "Analysis of the Linux kernel protocol stack: the TCP server-side send process", it was mentioned that sys_socket=>sock_create invokes the sock_alloc function, but what it does was not explained there.
Now let's briefly go through how it works; it is mainly related to the file system.
struct socket *sock_alloc(void)
{
	struct inode *inode;
	struct socket *sock;

	inode = get_empty_inode();			//Get a free inode from the inode cache
	if (!inode)
		return NULL;

	inode->i_sb = sock_mnt->mnt_sb;			//The superblock points to the socket file system
	sock = socki_lookup(inode);

	inode->i_mode = S_IFSOCK|S_IRWXUGO;
	inode->i_sock = 1;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	sock->inode = inode;				//The sock references the inode
	init_waitqueue_head(&sock->wait);		//Initialize the wait queue
	sock->fasync_list = NULL;
	sock->state = SS_UNCONNECTED;			//Set to the disconnected state
	sock->flags = 0;
	sock->ops = NULL;
	sock->sk = NULL;
	sock->file = NULL;

	sockets_in_use[smp_processor_id()].counter++;
	return sock;
}
So where does sock_mnt come from? It is the vfsmount object describing the mount of the socket file system, and it is initialized in the sock_init function.
init=>do_basic_setup=>sock_init
void __init sock_init(void)
{
	......
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);
	......
}

#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
	name:		type, \
	read_super:	read, \
	fs_flags:	flags, \
	owner:		THIS_MODULE, \
}

static DECLARE_FSTYPE(sock_fs_type, "sockfs", sockfs_read_super, FS_NOMOUNT|FS_SINGLE);

//After macro expansion this is equivalent to:
static struct file_system_type sock_fs_type = {
	name:		"sockfs",
	read_super:	sockfs_read_super,
	fs_flags:	FS_NOMOUNT|FS_SINGLE,
	owner:		THIS_MODULE,
};
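The "name:" member syntax above is an old GCC extension. Just as an aside (this is not part of the kernel source), the same table written with standard C99 designated initializers would look like this:

static struct file_system_type sock_fs_type = {
	.name       = "sockfs",
	.read_super = sockfs_read_super,
	.fs_flags   = FS_NOMOUNT | FS_SINGLE,
	.owner      = THIS_MODULE,
};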
To mount the socket file system, its file system type, namely sock_fs_type, must first be registered with the system, i.e. inserted into the file_systems global variable;
init=>do_basic_setup=>sock_init=>register_filesystem
int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (!fs)
		return -EINVAL;
	if (fs->next)
		return -EBUSY;
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name);	//Look up by file system type name; if it is already registered, return an error
	if (*p)
		res = -EBUSY;
	else
		*p = fs;
	write_unlock(&file_systems_lock);
	return res;
}
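register_filesystem relies on find_filesystem, which is not shown above. As a rough sketch of the 2.4-era logic (verify against your own source tree), it walks the singly linked file_systems list and returns the address of the matching pointer, or of the terminating NULL slot if the name is not found, which is exactly what *p tests for above:

static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **p;
	for (p = &file_systems; *p; p = &(*p)->next)
		if (strcmp((*p)->name, name) == 0)
			break;
	return p;	//points either at the matching entry or at the tail NULL slot
}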
Continue with the mount process. This mount is performed by the kernel itself and cannot be initiated from user space:
init=>do_basic_setup=>sock_init=>kern_mount
struct vfsmount *kern_mount(struct file_system_type *type)
{
	kdev_t dev = get_unnamed_dev();
	struct super_block *sb;		//Superblock pointer
	struct vfsmount *mnt;		//Mount pointer
	if (!dev)
		return ERR_PTR(-EMFILE);
	sb = read_super(dev, NULL, type, 0, NULL, 0);	//Read the superblock
	if (!sb) {
		put_unnamed_dev(dev);
		return ERR_PTR(-EINVAL);
	}
	mnt = add_vfsmnt(NULL, sb->s_root, NULL);	//Mount
	if (!mnt) {
		kill_super(sb, 0);
		return ERR_PTR(-ENOMEM);
	}
	type->kern_mnt = mnt;
	return mnt;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
				       struct file_system_type *type, int flags,
				       void *data, int silent)
{
	struct super_block * s;
	s = get_empty_super();		//Allocate a superblock object
	if (!s)
		goto out;
	s->s_dev = dev;
	s->s_bdev = bdev;
	s->s_flags = flags;
	s->s_dirt = 0;
	sema_init(&s->s_vfs_rename_sem,1);
	sema_init(&s->s_nfsd_free_path_sem,1);
	s->s_type = type;
	sema_init(&s->s_dquot.dqio_sem, 1);
	sema_init(&s->s_dquot.dqoff_sem, 1);
	s->s_dquot.flags = 0;
	lock_super(s);
	if (!type->read_super(s, data, silent))	//Call sock_fs_type's sockfs_read_super to read the superblock
		goto out_fail;
	unlock_super(s);
	/* tell bdcache that we are going to keep this one */
	if (bdev)
		atomic_inc(&bdev->bd_count);
	......
	return NULL;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super
static struct super_operations sockfs_ops = {
	statfs:	sockfs_statfs,
};

static struct super_block * sockfs_read_super(struct super_block *sb, void *data, int silent)
{
	struct inode *root = new_inode(sb);	//Create the root inode; it points back to sb
	if (!root)
		return NULL;
	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;	//Set directory attributes
	root->i_uid = root->i_gid = 0;
	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
	sb->s_blocksize = 1024;
	sb->s_blocksize_bits = 10;
	sb->s_magic = SOCKFS_MAGIC;		//Superblock magic number
	sb->s_op = &sockfs_ops;			//Set the superblock operations table
	sb->s_root = d_alloc(NULL, &(const struct qstr) { "socket:", 7, 0 });	//Create the root dentry
	if (!sb->s_root) {
		iput(root);
		return NULL;
	}
	sb->s_root->d_sb = sb;			//Each dentry keeps a pointer to its superblock for later access
	sb->s_root->d_parent = sb->s_root;	//The root of the socket file system has no parent directory, so it points to itself
	d_instantiate(sb->s_root, root);	//Associate the dentry with the inode
	return sb;
}
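sockfs_ops only fills in the statfs callback. As far as I recall, in this era of the kernel it does little more than report the magic number, block size and a name-length limit; roughly like the following sketch (check your source tree for the authoritative version):

static int sockfs_statfs(struct super_block *sb, struct statfs *buf)
{
	buf->f_type = SOCKFS_MAGIC;	//report sockfs' magic number
	buf->f_bsize = sb->s_blocksize;
	buf->f_namelen = 255;
	return 0;
}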
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_alloc
struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
{
	char * str;
	struct dentry *dentry;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);	//Allocate a dentry from the slab cache
	if (!dentry)
		return NULL;

	if (name->len > DNAME_INLINE_LEN-1) {	//If the name is too long, allocate separate memory for it
		str = kmalloc(NAME_ALLOC_LEN(name->len), GFP_KERNEL);
		if (!str) {
			kmem_cache_free(dentry_cache, dentry);
			return NULL;
		}
	} else
		str = dentry->d_iname;

	memcpy(str, name->name, name->len);	//Copy the name into dentry->d_iname (or the kmalloc'd buffer)
	str[name->len] = 0;

	atomic_set(&dentry->d_count, 1);	//Set the reference count to 1
	dentry->d_flags = 0;
	dentry->d_inode = NULL;
	dentry->d_parent = NULL;
	dentry->d_sb = NULL;
	dentry->d_name.name = str;
	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	INIT_LIST_HEAD(&dentry->d_vfsmnt);	//Initialize the mount list; a directory may be used as a mount point more than once
	INIT_LIST_HEAD(&dentry->d_hash);	//Every dentry is eventually hung into the dentry_hashtable hash table
	INIT_LIST_HEAD(&dentry->d_lru);		//A dentry may be inserted into the least-recently-used dentry list
	INIT_LIST_HEAD(&dentry->d_subdirs);	//A directory may have several subdirectories; each child hangs via its d_child into the parent's d_subdirs
	INIT_LIST_HEAD(&dentry->d_alias);	//An inode may correspond to several dentries, which hang via d_alias into the inode's i_dentry list
	if (parent) {
		dentry->d_parent = dget(parent);
		dentry->d_sb = parent->d_sb;
		spin_lock(&dcache_lock);
		list_add(&dentry->d_child, &parent->d_subdirs);
		spin_unlock(&dcache_lock);
	} else
		INIT_LIST_HEAD(&dentry->d_child);

	dentry_stat.nr_dentry++;
	return dentry;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_instantiate
void d_instantiate(struct dentry *entry, struct inode * inode)
{
	spin_lock(&dcache_lock);
	if (inode)
		list_add(&entry->d_alias, &inode->i_dentry);	//Chain the entry into the inode's i_dentry list: a file has only one inode, but it may have several alias dentries
	entry->d_inode = inode;		//Point the entry's d_inode at the inode so it can be reached later, e.g. when looking up the file
	spin_unlock(&dcache_lock);
}
After reading the super block, let's take a look at the mounting process:
static struct vfsmount *add_vfsmnt(struct nameidata *nd, struct dentry *root, const char *dev_name)
{
	struct vfsmount *mnt;
	struct super_block *sb = root->d_inode->i_sb;	//Get the superblock object
	char *name;

	mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);	//Allocate the mount object
	if (!mnt)
		goto out;
	memset(mnt, 0, sizeof(struct vfsmount));

	if (nd || dev_name)
		mnt->mnt_flags = MNT_VISIBLE;

	/* It may be NULL, but who cares? */
	if (dev_name) {
		name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
		if (name) {
			strcpy(name, dev_name);
			mnt->mnt_devname = name;
		}
	}
	mnt->mnt_owner = current->uid;
	atomic_set(&mnt->mnt_count,1);	//Set the reference count to 1
	mnt->mnt_sb = sb;		//Save the superblock object's address; this is the sock_mnt->mnt_sb we saw in sock_alloc

	spin_lock(&dcache_lock);
	if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
		goto fail;
	mnt->mnt_root = dget(root);					//The mounted file system's root dentry
	mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);	//The mount point's dentry; here it is also the root dentry
	mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;			//The parent mount; here it points to the mount itself
	if (nd) {
		list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
		list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
	} else {
		INIT_LIST_HEAD(&mnt->mnt_child);	//If child mount points appear under this file system later, they hang via mnt_child into the parent's mnt_mounts list
		INIT_LIST_HEAD(&mnt->mnt_clash);
	}
	INIT_LIST_HEAD(&mnt->mnt_mounts);
	list_add(&mnt->mnt_instances, &sb->s_mounts);	//Chain the mount into the superblock's list of mounts
	list_add(&mnt->mnt_list, vfsmntlist.prev);
	spin_unlock(&dcache_lock);
out:
	return mnt;
fail:
	spin_unlock(&dcache_lock);
	if (mnt->mnt_devname)
		kfree(mnt->mnt_devname);
	kfree(mnt);
	return NULL;
}
At this point the root inode has been created and tied to the root dentry "socket:". A vfsmount object is then created and cross-linked with the superblock, the root inode and the root dentry.
Now go back to socki_lookup at the beginning of sock_alloc: it returns the socket object, and the address of the allocated inode is then saved in the socket's inode field.
struct inode {
	......
	union {
		struct minix_inode_info		minix_i;
		struct ext2_inode_info		ext2_i;
		......
		struct socket			socket_i;
		struct usbdev_inode_info	usbdev_i;
		void				*generic_ip;
	} u;
};

extern __inline__ struct socket *socki_lookup(struct inode *inode)
{
	return &inode->u.socket_i;	//The struct socket is allocated together with the inode, so it can be accessed directly here
}
In the sys_socket function there is another very important piece that is closely related to the socket file system, namely sock_map_fd:
static int sock_map_fd(struct socket *sock)
{
	int fd;
	struct qstr this;
	char name[32];

	/*
	 *	Find a file descriptor suitable for return to the user.
	 */

	fd = get_unused_fd();		//Get a free file descriptor
	if (fd >= 0) {
		struct file *file = get_empty_filp();	//Get a free file object

		if (!file) {
			put_unused_fd(fd);
			fd = -ENFILE;
			goto out;
		}

		sprintf(name, "[%lu]", sock->inode->i_ino);	//The file name, built from the inode number
		this.name = name;
		this.len = strlen(name);
		this.hash = sock->inode->i_ino;

		file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);	//Create a dentry for this file name
		if (!file->f_dentry) {
			put_filp(file);
			put_unused_fd(fd);
			fd = -ENOMEM;
			goto out;
		}
		file->f_dentry->d_op = &sockfs_dentry_operations;	//Set the dentry operations table
		d_add(file->f_dentry, sock->inode);	//Associate the inode with the dentry and hang the dentry into dentry_hashtable
		file->f_vfsmnt = mntget(sock_mnt);	//The file points at this mount object

		sock->file = file;
		file->f_op = sock->inode->i_fop = &socket_file_ops;	//Set the file operations table; this is why calling write from user space can send data: internally it ends up in sock->ops->sendmsg, which for the INET protocol family is inet_sendmsg
		file->f_mode = 3;
		file->f_flags = O_RDWR;
		file->f_pos = 0;
		fd_install(fd, file);	//Associate fd with the file
	}

out:
	return fd;
}
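The last step, fd_install, is where the descriptor number actually starts pointing at the file object. If memory serves, the 2.4 version is roughly the following inline helper (verify against include/linux/file.h in your tree): it simply stores the file pointer into the current process's fd array under the files lock.

static inline void fd_install(unsigned int fd, struct file * file)
{
	struct files_struct *files = current->files;

	write_lock(&files->file_lock);
	if (files->fd[fd])
		BUG();			//the slot must still be free
	files->fd[fd] = file;		//from now on, fd refers to this file object
	write_unlock(&files->file_lock);
}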
sys_socket=>sock_map_fd=>d_add
static __inline__ void d_add(struct dentry * entry, struct inode * inode)
{
	d_instantiate(entry, inode);	//Associate the inode with the dentry
	d_rehash(entry);		//Recompute the entry's hash and hang it into dentry_hashtable
}
sys_socket=>sock_map_fd=>d_add=>d_rehash
void d_rehash(struct dentry * entry)
{
	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);	//Compute the hash value and find the corresponding bucket
	spin_lock(&dcache_lock);
	list_add(&entry->d_hash, list);	//Add it to the bucket's linked list in the hash table
	spin_unlock(&dcache_lock);
}
sys_socket=>sock_map_fd=>d_add=>d_rehash=>d_hash
This is quite clever: the parent dentry's address is mixed into the hash calculation, which helps avoid collisions. Take /zhangsan/project/src and /lisi/project/src. If only the current directory or file name were used as the hash key, the two "src" entries would easily collide; using the parent directory's name as part of the hash would not help much either, since names like "project" repeat just as often. So the kernel uses the address of the dentry's parent dentry as part of the hash value, and in theory the probability of "src" colliding drops sharply.
static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash)
{
	hash += (unsigned long) parent / L1_CACHE_BYTES;
	hash = hash ^ (hash >> D_HASHBITS) ^ (hash >> D_HASHBITS*2);
	return dentry_hashtable + (hash & D_HASHMASK);
}
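As a small user-space illustration of that point (my own sketch, not kernel code; D_HASHBITS, D_HASHMASK and L1_CACHE_BYTES are placeholder values here, since in the kernel the table size is computed at boot), hashing the same name under two different parent dentry addresses lands in different buckets:

#include <stdio.h>

/* Placeholder values; in the kernel these depend on the boot-time table size. */
#define D_HASHBITS	14
#define D_HASHMASK	((1UL << D_HASHBITS) - 1)
#define L1_CACHE_BYTES	32

/* Same mixing as d_hash(): fold the parent pointer into the name hash. */
static unsigned long bucket(unsigned long parent_addr, unsigned long name_hash)
{
	unsigned long hash = name_hash + parent_addr / L1_CACHE_BYTES;
	hash = hash ^ (hash >> D_HASHBITS) ^ (hash >> D_HASHBITS * 2);
	return hash & D_HASHMASK;
}

int main(void)
{
	unsigned long src_hash = 0x5f3c;	/* pretend hash of the name "src" */

	/* Two made-up parent dentry addresses: .../zhangsan/project and .../lisi/project */
	printf("bucket under parent A: %lu\n", bucket(0x10000UL, src_hash));
	printf("bucket under parent B: %lu\n", bucket(0x20000UL, src_hash));
	return 0;
}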
Most operations in sockfs_dentry_operations are left unset, so the system's default handling is used:
static struct dentry_operations sockfs_dentry_operations = {
	d_delete:	sockfs_delete_dentry,
};
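The one callback that is provided, sockfs_delete_dentry, is trivial. As far as I recall, in 2.4 it simply returns 1 so the dentry is deleted as soon as its last reference goes away rather than being kept in the cache (check your source tree):

static int sockfs_delete_dentry(struct dentry *dentry)
{
	return 1;	//always delete the dentry instead of caching it
}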
socket_file_ops:
static struct file_operations socket_file_ops = {
	llseek:		sock_lseek,
	read:		sock_read,
	write:		sock_write,
	poll:		sock_poll,
	ioctl:		sock_ioctl,
	mmap:		sock_mmap,
	open:		sock_no_open,	/* special open code to disallow open via /proc */
	release:	sock_close,
	fasync:		sock_fasync,
	readv:		sock_readv,
	writev:		sock_writev
};
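sock_no_open is worth a glance because of that comment. If memory serves, in 2.4 it simply refuses the open, so a socket inode cannot be re-opened through paths such as /proc/<pid>/fd (again, verify against your tree):

static int sock_no_open(struct inode *irrelevant, struct file *dontcare)
{
	return -ENXIO;	//refuse to open a socket inode through the file system
}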
Let's take write as an example and see whether it has the same effect as sending data with send from user space.
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t ret;
	struct file * file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_WRITE) {
			struct inode *inode = file->f_dentry->d_inode;	//Find the corresponding inode
			ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file,
				file->f_pos, count);
			if (!ret) {
				ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
				ret = -EINVAL;
				if (file->f_op && (write = file->f_op->write) != NULL)
					ret = write(file, buf, count, &file->f_pos);	//For a socket this is clearly sock_write
			}
		}
		if (ret > 0)
			inode_dir_notify(file->f_dentry->d_parent->d_inode, DN_MODIFY);
		fput(file);
	}
	return ret;
}
sys_write=>sock_write
static ssize_t sock_write(struct file *file, const char *ubuf,
			  size_t size, loff_t *ppos)
{
	struct socket *sock;
	struct msghdr msg;
	struct iovec iov;

	if (ppos != &file->f_pos)
		return -ESPIPE;
	if (size == 0)		/* Match SYS5 behaviour */
		return 0;

	sock = socki_lookup(file->f_dentry->d_inode);	//Find the socket object through the inode, as we saw earlier

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
	if (sock->type == SOCK_SEQPACKET)
		msg.msg_flags |= MSG_EOR;
	iov.iov_base = (void *)ubuf;
	iov.iov_len = size;

	return sock_sendmsg(sock, &msg, size);	//Send the data
}
sys_write=>sock_write=>sock_sendmsg
int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
	int err;
	struct scm_cookie scm;

	err = scm_send(sock, msg, &scm);
	if (err >= 0) {
		err = sock->ops->sendmsg(sock, msg, size, &scm);	//We are very familiar with this: for the INET protocol family, inet_stream_ops' inet_sendmsg is called here
		scm_destroy(&scm);
	}
	return err;
}
At this point we understand how the socket file system is mounted, and why calling send and write from user space has the same effect: sock_write ultimately reaches the send function of whatever protocol stack the socket uses.
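To make the equivalence concrete, here is a small user-space sketch of my own (the helper name and message are made up): on a connected TCP socket, write(fd, buf, len) and send(fd, buf, len, 0) travel the same kernel path down to the protocol's sendmsg.

#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send the same message twice on a connected socket: once via write(),
 * once via send() with flags 0. Both end up in sock_sendmsg() and then
 * in the protocol's sendmsg (inet_sendmsg for the INET family). */
static int send_both_ways(int sockfd, const char *msg)
{
	size_t len = strlen(msg);

	if (write(sockfd, msg, len) < 0)
		return -1;
	if (send(sockfd, msg, len, 0) < 0)
		return -1;
	return 0;
}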