Analysis of the sock_alloc principle

In the blog post "Analysis of the Linux kernel protocol stack — TCP server-side send process", it was mentioned that sys_socket => sock_create invokes the sock_alloc function, but what that function does was not explained there.

Now let's briefly explain the principle; it is mainly related to the file system.

/*
 * sock_alloc - allocate a socket together with its backing sockfs inode.
 * In this kernel (2.4) the struct socket is embedded in the inode's union
 * (inode->u.socket_i), so obtaining a free inode yields both objects.
 * Returns the socket, or NULL if no inode could be obtained.
 */
struct socket *sock_alloc(void)
{
	struct inode * inode;
	struct socket * sock;

	inode = get_empty_inode();//get a free inode from the inode cache
	if (!inode)
		return NULL;

	inode->i_sb = sock_mnt->mnt_sb;//point at the sockfs superblock
	sock = socki_lookup(inode);//the socket lives inside the inode's union

	inode->i_mode = S_IFSOCK|S_IRWXUGO;//socket file type, rwx for everyone
	inode->i_sock = 1;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;

	sock->inode = inode;//back-pointer from the socket to its inode
	init_waitqueue_head(&sock->wait);//initialize the wait queue
	sock->fasync_list = NULL;
	sock->state = SS_UNCONNECTED;//start in the unconnected state
	sock->flags = 0;
	sock->ops = NULL;
	sock->sk = NULL;
	sock->file = NULL;

	sockets_in_use[smp_processor_id()].counter++;//per-CPU count of live sockets
	return sock;
}

Where does sock_mnt come from? It is a vfsmount object describing the mounted sockfs file system, and it is initialized in the sock_init function.

init=>do_basic_setup=>sock_init

/*
 * sock_init - boot-time socket layer setup (excerpt): register the sockfs
 * file system type, then mount it kernel-internally, saving the vfsmount
 * in the global sock_mnt used later by sock_alloc.
 */
void __init sock_init(void)
{
......

	register_filesystem(&sock_fs_type);//add sockfs to the global file_systems list
	sock_mnt = kern_mount(&sock_fs_type);//kernel-internal mount; keep the vfsmount
......
}

/*
 * DECLARE_FSTYPE builds a struct file_system_type initializer
 * (GNU "label:" designated-initializer syntax, pre-C99).
 */
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
	name:		type, \
	read_super:	read, \
	fs_flags:	flags, \
	owner:		THIS_MODULE, \
}


/* sockfs cannot be mounted from user space (FS_NOMOUNT) and shares a
 * single superblock (FS_SINGLE). */
static DECLARE_FSTYPE(sock_fs_type, "sockfs", sockfs_read_super,
	FS_NOMOUNT|FS_SINGLE);

//After macro expansion the result is:
struct file_system_type sock_fs_type= { 
	name:		"sockfs", 
	read_super:	sockfs_read_super, 
	fs_flags:	FS_NOMOUNT|FS_SINGLE, 
	owner:		THIS_MODULE, 
}

To mount the socket file system, its file system type must first be registered with the system; that is, sock_fs_type is inserted into the file_systems global list.

init=>do_basic_setup=>sock_init=>register_filesystem

/*
 * register_filesystem - add a file_system_type to the global file_systems
 * list. Returns 0 on success, -EINVAL for a NULL argument, -EBUSY if the
 * type is already linked or a type with the same name is already registered.
 */
int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (!fs)
		return -EINVAL;
	if (fs->next)
		return -EBUSY;//already linked into the list
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name);//types are distinguished by name; *p != NULL means a duplicate exists
	if (*p)
		res = -EBUSY;
	else
		*p = fs;//link into the slot at the end of the list
	write_unlock(&file_systems_lock);
	return res;
}

Continue with the mount process, which is performed by the kernel itself and cannot be triggered from user space:

init=>do_basic_setup=>sock_init=>kern_mount

/*
 * kern_mount - mount a file system from inside the kernel (no user-visible
 * mount point): read the superblock, create the vfsmount and stash it in
 * type->kern_mnt. Returns the vfsmount, or an ERR_PTR on failure.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
	kdev_t dev = get_unnamed_dev();//anonymous device number for a nodev fs
	struct super_block *sb;//superblock pointer
	struct vfsmount *mnt;//mount object pointer
	if (!dev)
		return ERR_PTR(-EMFILE);
	sb = read_super(dev, NULL, type, 0, NULL, 0);//build and fill the superblock
	if (!sb) {
		put_unnamed_dev(dev);
		return ERR_PTR(-EINVAL);
	}
	mnt = add_vfsmnt(NULL, sb->s_root, NULL);//create and link the vfsmount
	if (!mnt) {
		kill_super(sb, 0);
		return ERR_PTR(-ENOMEM);
	}
	type->kern_mnt = mnt;//remember the kernel-internal mount on the type
	return mnt;
}

init=>do_basic_setup=>sock_init=>kern_mount=>read_super

/*
 * read_super - allocate a super_block, fill in the generic fields, and let
 * the file system type's read_super callback (sockfs_read_super here)
 * complete it. Returns the superblock on success, NULL on failure.
 *
 * NOTE(review): the original transcription ended with an unconditional
 * "return NULL;", which would make every mount fail — the caller
 * kern_mount() treats NULL as an error. The success path must return s
 * (reached via the "out" label), with NULL only on the failure paths.
 */
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
				       struct file_system_type *type, int flags,
				       void *data, int silent)
{
	struct super_block * s;
	s = get_empty_super();//allocate an empty superblock object
	if (!s)
		goto out;//s is NULL here, so NULL is returned
	s->s_dev = dev;
	s->s_bdev = bdev;
	s->s_flags = flags;
	s->s_dirt = 0;
	sema_init(&s->s_vfs_rename_sem,1);
	sema_init(&s->s_nfsd_free_path_sem,1);
	s->s_type = type;
	sema_init(&s->s_dquot.dqio_sem, 1);
	sema_init(&s->s_dquot.dqoff_sem, 1);
	s->s_dquot.flags = 0;
	lock_super(s);
	if (!type->read_super(s, data, silent))//for sockfs this calls sockfs_read_super to fill the superblock
		goto out_fail;
	unlock_super(s);
	/* tell bdcache that we are going to keep this one */
	if (bdev)
		atomic_inc(&bdev->bd_count);
out:
	return s;//success: the initialized superblock (NULL only if allocation failed)

out_fail:
......
	return NULL;//type->read_super rejected the superblock
}

init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super

/* sockfs only provides statfs; every other superblock op uses the VFS default. */
static struct super_operations sockfs_ops = {
	statfs:		sockfs_statfs,
};

/*
 * sockfs_read_super - fill in the sockfs superblock: allocate the root
 * inode, create the root dentry named "socket:", and tie them together.
 * Returns sb on success, NULL on allocation failure.
 */
static struct super_block * sockfs_read_super(struct super_block *sb, void *data, int silent)
{
	struct inode *root = new_inode(sb);//allocate the root inode, already pointing at sb
	if (!root)
		return NULL;
	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;//directory, read/write for owner only
	root->i_uid = root->i_gid = 0;
	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
	sb->s_blocksize = 1024;
	sb->s_blocksize_bits = 10;
	sb->s_magic = SOCKFS_MAGIC;//superblock magic number
	sb->s_op	= &sockfs_ops;//superblock operations table
	sb->s_root = d_alloc(NULL, &(const struct qstr) { "socket:", 7, 0 });//root dentry named "socket:"
	if (!sb->s_root) {
		iput(root);
		return NULL;
	}
	sb->s_root->d_sb = sb;//every dentry keeps a pointer back to its superblock for later access
	sb->s_root->d_parent = sb->s_root;//the sockfs root has no parent directory, so it points to itself
	d_instantiate(sb->s_root, root);//bind the dentry to the inode
	return sb;
}

 init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_alloc

/*
 * d_alloc - allocate and initialize a dentry with the given name under
 * parent (parent may be NULL for a file system root). Returns the new
 * dentry, or NULL on allocation failure.
 */
struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
{
	char * str;
	struct dentry *dentry;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); //allocate the dentry from the slab cache
	if (!dentry)
		return NULL;

	if (name->len > DNAME_INLINE_LEN-1) {//name too long for the inline buffer: allocate separately
		str = kmalloc(NAME_ALLOC_LEN(name->len), GFP_KERNEL);
		if (!str) {
			kmem_cache_free(dentry_cache, dentry); 
			return NULL;
		}
	} else
		str = dentry->d_iname; 

	memcpy(str, name->name, name->len);//copy the name into dentry->d_iname (or the kmalloc'd buffer)
	str[name->len] = 0;

	atomic_set(&dentry->d_count, 1);//reference count starts at 1
	dentry->d_flags = 0;
	dentry->d_inode = NULL;
	dentry->d_parent = NULL;
	dentry->d_sb = NULL;
	dentry->d_name.name = str;
	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	INIT_LIST_HEAD(&dentry->d_vfsmnt);//mounts on this directory; one directory can be mounted on several times
	INIT_LIST_HEAD(&dentry->d_hash);//link used to chain the dentry into the dentry_hashtable
	INIT_LIST_HEAD(&dentry->d_lru);//link for the least-recently-used dentry list
	INIT_LIST_HEAD(&dentry->d_subdirs);//children of this directory, each linked here via its d_child
	INIT_LIST_HEAD(&dentry->d_alias);//one inode may have several dentries, chained on inode->i_dentry via d_alias
	if (parent) {
		dentry->d_parent = dget(parent);//take a reference on the parent
		dentry->d_sb = parent->d_sb;//inherit the parent's superblock
		spin_lock(&dcache_lock);
		list_add(&dentry->d_child, &parent->d_subdirs);//hang under the parent's child list
		spin_unlock(&dcache_lock);
	} else
		INIT_LIST_HEAD(&dentry->d_child);

	dentry_stat.nr_dentry++;
	return dentry;
}

init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_instantiate

/*
 * d_instantiate - bind a dentry to an inode. A file has exactly one inode
 * but may have several dentries (aliases), chained on inode->i_dentry.
 */
void d_instantiate(struct dentry *entry, struct inode * inode)
{
	spin_lock(&dcache_lock);
	if (inode)
		list_add(&entry->d_alias, &inode->i_dentry);//chain this alias into the inode's dentry list
	entry->d_inode = inode;//let the dentry reach its inode on later lookups
	spin_unlock(&dcache_lock);
}

After reading the superblock, let's look at the mounting process itself:

/*
 * add_vfsmnt - allocate and link a vfsmount for the file system whose root
 * dentry is root. nd names the mount point (NULL for a kernel-internal
 * mount, in which case the mount is its own mount point and parent).
 * Returns the vfsmount, or NULL on failure.
 */
static struct vfsmount *add_vfsmnt(struct nameidata *nd,
				struct dentry *root,
				const char *dev_name)
{
	struct vfsmount *mnt;
	struct super_block *sb = root->d_inode->i_sb;//superblock that owns the root dentry
	char *name;
    
	mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);//allocate the mount object
	if (!mnt)
		goto out;
	memset(mnt, 0, sizeof(struct vfsmount));

	if (nd || dev_name)
		mnt->mnt_flags = MNT_VISIBLE;

	/* It may be NULL, but who cares? */
	if (dev_name) {
		name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
		if (name) {
			strcpy(name, dev_name);
			mnt->mnt_devname = name;
		}
	}
	mnt->mnt_owner = current->uid;
	atomic_set(&mnt->mnt_count,1);//reference count starts at 1
	mnt->mnt_sb = sb;//this is the sock_mnt->mnt_sb that sock_alloc reads

	spin_lock(&dcache_lock);
	if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
		goto fail;
	mnt->mnt_root = dget(root);//root dentry of the mounted file system
	mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);//with no mount point, the root is its own mount point
	mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;//with no parent mount, the mount is its own parent

	if (nd) {
		list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
		list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
	} else {
		INIT_LIST_HEAD(&mnt->mnt_child);//child mounts below this one would be chained via mnt_child
		INIT_LIST_HEAD(&mnt->mnt_clash);
	}
	INIT_LIST_HEAD(&mnt->mnt_mounts);
	list_add(&mnt->mnt_instances, &sb->s_mounts);//chain the mount into the superblock's mount list
	list_add(&mnt->mnt_list, vfsmntlist.prev);//append to the global mount list
	spin_unlock(&dcache_lock);
out:
	return mnt;
fail:
	spin_unlock(&dcache_lock);
	if (mnt->mnt_devname)
		kfree(mnt->mnt_devname);
	kfree(mnt);
	return NULL;
}

The root inode of sockfs is created along with its root dentry "socket:". Then a vfsmount object is created and cross-linked with the superblock (sb), the root inode, and the root dentry.

Now look back at socki_lookup from the code at the beginning: it obtains the socket object embedded in the inode, while the allocated inode's address is saved in the socket's inode field.

/*
 * In 2.4 the file-system-specific part of an inode is a union; for sockfs
 * inodes the struct socket is embedded directly as u.socket_i, so the
 * inode and the socket are allocated together.
 */
struct inode {
......

	union {
		struct minix_inode_info		minix_i;
		struct ext2_inode_info		ext2_i;
......
		struct socket			socket_i;//the socket embedded in a sockfs inode
		struct usbdev_inode_info        usbdev_i;
		void				*generic_ip;
	} u;
};



/* Map a sockfs inode to its embedded socket. */
extern __inline__ struct socket *socki_lookup(struct inode *inode)
{
	return &inode->u.socket_i;//the socket is allocated together with the inode, inside its union
}

In the sys_socket function there is a very important piece closely related to the socket file system, namely sock_map_fd:

/*
 * sock_map_fd - bind a socket to a new file descriptor: allocate an fd and
 * a file object, create a sockfs dentry named "[ino]" for the socket's
 * inode, and install the file into the fd table. Returns the fd, or a
 * negative errno on failure.
 */
static int sock_map_fd(struct socket *sock)
{
	int fd;
	struct qstr this;
	char name[32];

	/*
	 *	Find a file descriptor suitable for return to the user. 
	 */

	fd = get_unused_fd();//grab a free file descriptor
	if (fd >= 0) {
		struct file *file = get_empty_filp();//grab a free file object

		if (!file) {
			put_unused_fd(fd);
			fd = -ENFILE;
			goto out;
		}

		sprintf(name, "[%lu]", sock->inode->i_ino);//the "file name" is built from the inode number
		this.name = name;
		this.len = strlen(name);
		this.hash = sock->inode->i_ino;

		file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);//create the dentry under the sockfs root
		if (!file->f_dentry) {
			put_filp(file);
			put_unused_fd(fd);
			fd = -ENOMEM;
			goto out;
		}
		file->f_dentry->d_op = &sockfs_dentry_operations;//dentry operations table
		d_add(file->f_dentry, sock->inode);//bind dentry to inode and hash it into dentry_hashtable
		file->f_vfsmnt = mntget(sock_mnt);//the file references the sockfs mount

		sock->file = file;
		file->f_op = sock->inode->i_fop = &socket_file_ops;//file ops table: this is why write() on a socket fd ends up in sock->ops->sendmsg (inet_sendmsg for the inet family)
		file->f_mode = 3;
		file->f_flags = O_RDWR;
		file->f_pos = 0;
		fd_install(fd, file);//publish the file in the fd table
	}

out:
	return fd;
}

sys_socket=>sock_map_fd=>d_add

/* Bind a dentry to an inode, then hash the dentry into dentry_hashtable. */
static __inline__ void d_add(struct dentry * entry, struct inode * inode)
{
	d_instantiate(entry, inode);//associate the inode with the dentry
	d_rehash(entry);//compute the hash and chain the entry into dentry_hashtable
}

 sys_socket=>sock_map_fd=>d_add=>d_rehash

/* Insert the dentry into its dentry_hashtable bucket. */
void d_rehash(struct dentry * entry)
{
	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);//pick the bucket for (parent, name hash)
	spin_lock(&dcache_lock);
	list_add(&entry->d_hash, list);//chain into the bucket's linked list
	spin_unlock(&dcache_lock);
}

  sys_socket=>sock_map_fd=>d_add=>d_rehash=>d_hash

This is very ingenious: the address of the parent directory's dentry is mixed into the hash. Consider /zhangsan/project/src and /lisi/project/src — if only the current name ("src") were hashed, such common names would collide constantly; hashing the parent's name ("project" in both paths) would collide just as easily. Since every directory's dentry has a unique address, folding the parent dentry's address into the hash greatly reduces the theoretical collision probability for names like "src".

/* Mix the parent dentry's address into the name hash and select a bucket. */
static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash)
{
	hash += (unsigned long) parent / L1_CACHE_BYTES;//parent address (cache-line granular) disambiguates equal names
	hash = hash ^ (hash >> D_HASHBITS) ^ (hash >> D_HASHBITS*2);//fold the high bits down
	return dentry_hashtable + (hash & D_HASHMASK);//index into the bucket array
}

In sockfs_dentry_operations most operations are left empty, so the system default code paths are used:

/* Only d_delete is provided; all other dentry ops fall back to VFS defaults. */
static struct dentry_operations sockfs_dentry_operations = {
	d_delete:	sockfs_delete_dentry,
};

socket_file_ops:

/* File operations for socket inodes: read/write/poll on a socket fd route here. */
static struct file_operations socket_file_ops = {
	llseek:		sock_lseek,
	read:		sock_read,
	write:		sock_write,
	poll:		sock_poll,
	ioctl:		sock_ioctl,
	mmap:		sock_mmap,
	open:		sock_no_open,	/* special open code to disallow open via /proc */
	release:	sock_close,
	fasync:		sock_fasync,
	readv:		sock_readv,
	writev:		sock_writev
};

Let's take write as an example to see if it has the same effect as send ing data in our user state.

/*
 * sys_write - the write(2) system call: look up the file object, verify
 * write mode and mandatory locks, then dispatch through f_op->write
 * (sock_write for a socket fd).
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t ret;
	struct file * file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_WRITE) {
			struct inode *inode = file->f_dentry->d_inode;//the inode behind the fd
			ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file,
				file->f_pos, count);
			if (!ret) {
				ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
				ret = -EINVAL;
				if (file->f_op && (write = file->f_op->write) != NULL)
					ret = write(file, buf, count, &file->f_pos);//for a socket fd this is sock_write
			}
		}
		if (ret > 0)
			inode_dir_notify(file->f_dentry->d_parent->d_inode,
				DN_MODIFY);
		fput(file);
	}
	return ret;
}

sys_write=>sock_write

/*
 * sock_write - f_op->write for sockets: wrap the user buffer in a msghdr
 * with a single iovec and forward it to sock_sendmsg.
 */
static ssize_t sock_write(struct file *file, const char *ubuf,
			  size_t size, loff_t *ppos)
{
	struct socket *sock;
	struct msghdr msg;
	struct iovec iov;
	
	if (ppos != &file->f_pos)
		return -ESPIPE;
	if(size==0)		/* Match SYS5 behaviour */
		return 0;

	sock = socki_lookup(file->f_dentry->d_inode); //recover the socket embedded in the inode, as seen earlier

	msg.msg_name=NULL;
	msg.msg_namelen=0;
	msg.msg_iov=&iov;
	msg.msg_iovlen=1;
	msg.msg_control=NULL;
	msg.msg_controllen=0;
	msg.msg_flags=!(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;//O_NONBLOCK on the fd maps to MSG_DONTWAIT
	if (sock->type == SOCK_SEQPACKET)
		msg.msg_flags |= MSG_EOR;
	iov.iov_base=(void *)ubuf;
	iov.iov_len=size;
	
	return sock_sendmsg(sock, &msg, size);//hand off to the protocol-independent send path
}

sys_write=>sock_write=>sock_sendmsg

/*
 * sock_sendmsg - protocol-independent send: process any control messages
 * (scm), then invoke the protocol family's sendmsg operation.
 */
int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
	int err;
	struct scm_cookie scm;

	err = scm_send(sock, msg, &scm);
	if (err >= 0) {
		err = sock->ops->sendmsg(sock, msg, size, &scm);//for the inet family this dispatches to inet_sendmsg
		scm_destroy(&scm);
	}
	return err;
}

 

At this point we understand how the socket file system is mounted, and why calling send and write from user space has the same effect: sock_write ultimately reaches the send function of the protocol stack in use.

 

 

 

Added by metalspawned on Thu, 10 Feb 2022 11:42:36 +0200