=========================================================
これは、
Linux-3.5/Documentation/filesystems/files.txt の和訳です。
翻訳団体： JF プロジェクト < http://linuxjf.sourceforge.jp/ >
更新日 ： 2012/09/04
翻訳者 ： Seiji Kaneko < skaneko at mbn dot or dot jp >
校正者 ： Masanori Kobayashi <zap03216 at nifty dot ne dot jp >
=========================================================
#File management in the Linux kernel
#-----------------------------------
Linux カーネルでのファイル管理
------------------------------

#This document describes how locking for files (struct file)
#and file descriptor table (struct files) works.
この文書は、ファイル (file 構造体) のロックとファイルディスクリプタテー
ブル (files 構造体) がどのように動作するかを記載したものです。

#Up until 2.6.12, the file descriptor table has been protected
#with a lock (files->file_lock) and reference count (files->count).
#->file_lock protected accesses to all the file related fields
#of the table. ->count was used for sharing the file descriptor
#table between tasks cloned with CLONE_FILES flag. Typically
#this would be the case for posix threads. As with the common
#refcounting model in the kernel, the last task doing
#a put_files_struct() frees the file descriptor (fd) table.
#The files (struct file) themselves are protected using
#reference count (->f_count).
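2.6.12 まではファイルディスクリプタテーブルはロック (files->file_lock) と
リファレンスカウント (files->count) で保護され、テーブルのファイルに関連
するすべてのフィールドへのアクセスは ->file_lock で保護されていました。
また、->count は CLONE_FILES フラグ付きで複製されたタスク間での、ファイル
ディスクリプタテーブルの共有に用いられていました。通常、posix スレッドが
この場合に該当します。カーネルのリファレンスカウントの共通モデルに沿って、
put_files_struct() を最後に行ったタスクがファイルディスクリプタ (fd)
テーブルを解放します。ファイル (struct file) 自体は、リファレンスカウント
(->f_count) を使って保護されています。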

#In the new lock-free model of file descriptor management,
#the reference counting is similar, but the locking is
#based on RCU. The file descriptor table contains multiple
#elements - the fd sets (open_fds and close_on_exec, the
#array of file pointers, the sizes of the sets and the array
#etc.). In order for the updates to appear atomic to
#a lock-free reader, all the elements of the file descriptor
#table are in a separate structure - struct fdtable.
#files_struct contains a pointer to struct fdtable through
#which the actual fd table is accessed. Initially the
#fdtable is embedded in files_struct itself. On a subsequent
#expansion of fdtable, a new fdtable structure is allocated
#and files->fdtab points to the new structure. The fdtable
#structure is freed with RCU and lock-free readers either
#see the old fdtable or the new fdtable making the update
#appear atomic. Here are the locking rules for
#the fdtable structure -
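ファイルディスクリプタ管理の新しいロックフリーモデルでは、リファレンスカウン
トについては従来と同様ですが、ロックは RCU に基づくものになっています。
ファイルディスクリプタテーブルには複数の要素 - fd セット一式 (open_fds と
close_on_exec, file ポインタの配列, セットや配列のサイズなど) - が格納され
ています。ロックなしの読み出し側から更新がアトミックであるように見せ
るために、ファイルディスクリプタテーブルの各要素はすべて独立したひとつの
構造体 - fdtable 構造体 - にあります。files_struct には fdtable 構造体
へのポインタが格納され、このポインタ経由で実際の fd テーブルへのアクセスが
行われます。最初は fdtable は files_struct 自体に埋め込まれています。
それ以降の fdtable の拡張時に、新しい fdtable 構造体が割り当てられ、
files->fdtab はその新しい構造体を指すようになります。fdtable 構造体は、
ロックなしの読み出し側が古い fdtable と新しい fdtable のいずれか一方を見て、
アトミックに更新されているように見えるよう、RCU により解放され
ます。以下は fdtable 構造体へのロック規則です。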

#1. All references to the fdtable must be done through
#   the files_fdtable() macro :
1. fdtable への参照は、必ず files_fdtable() マクロ経由で行なわなければなり
   ません。

	struct fdtable *fdt;

	rcu_read_lock();

	fdt = files_fdtable(files);
	....
	if (n <= fdt->max_fds)
		....
	...
	rcu_read_unlock();

#   files_fdtable() uses rcu_dereference() macro which takes care of
#   the memory barrier requirements for lock-free dereference.
#   The fdtable pointer must be read within the read-side
#   critical section.
   files_fdtable() は rcu_dereference() マクロを使います。
   rcu_dereference() マクロはロックなしの参照を行なうのに必要なメモリ
   バリア要求の面倒を見ます。fdtable ポインタは読み出し側のクリティカル
   セクション内で読み出さなければなりません。

#2. Reading of the fdtable as described above must be protected
#   by rcu_read_lock()/rcu_read_unlock().
2. 上記の fdtable の読み出しは、rcu_read_lock()/rcu_read_unlock() で
   保護しなければなりません。

#3. For any update to the fd table, files->file_lock must
#   be held.
3. fd テーブルを更新するには、どのような更新でも files->file_lock を
   保持していなければなりません。

#4. To look up the file structure given an fd, a reader
#   must use either fcheck() or fcheck_files() APIs. These
#   take care of barrier requirements due to lock-free lookup.
#   An example :
4. ある fd に対してファイル構造体を参照するには、読み出し側では fcheck() か
   fcheck_files() API を使わなければなりません。これらの API はロック
   なしの参照に伴うバリア要求の面倒を見てくれます。
   以下に例を示します。

	struct file *file;

	rcu_read_lock();
	file = fcheck(fd);
	if (file) {
		...
	}
	....
	rcu_read_unlock();

#5. Handling of the file structures is special. Since the look-up
#   of the fd (fget()/fget_light()) are lock-free, it is possible
#   that look-up may race with the last put() operation on the
#   file structure. This is avoided using atomic_long_inc_not_zero()
#   on ->f_count :
5. ファイル構造体の扱いには特別な注意が必要です。fd の検索
   (fget()/fget_light()) はロックを使いませんから、検索時にファイル構造
   体への最後の put() 操作と競合を起こす可能性があります。この問題は
   ->f_count に対して atomic_long_inc_not_zero() を用いることによっ
   て回避します。

	rcu_read_lock();
	file = fcheck_files(files, fd);
	if (file) {
		if (atomic_long_inc_not_zero(&file->f_count))
			*fput_needed = 1;
		else
		/* Didn't get the reference, someone's freed */
			file = NULL;
	}
	rcu_read_unlock();
	....
	return file;

#   atomic_long_inc_not_zero() detects if refcounts is already zero or
#   goes to zero during increment. If it does, we fail
#   fget()/fget_light().
   atomic_long_inc_not_zero() は refcount が既に 0 となっていること、
   または加算中に 0 となることを検出します。そのような場合、
   fget()/fget_light() は失敗します。

#6. Since both fdtable and file structures can be looked up
#   lock-free, they must be installed using rcu_assign_pointer()
#   API. If they are looked up lock-free, rcu_dereference()
#   must be used. However it is advisable to use files_fdtable()
#   and fcheck()/fcheck_files() which take care of these issues.
6. fdtable と file 構造体のいずれもがロックを使わないで参照可能ですから、
   これらについては rcu_assign_pointer() API を用いてインストールする必
   要があります。ロックを使わないで参照する場合、rcu_dereference() を使う
   必要があります。ただし、files_fdtable() と fcheck()/fcheck_files()
   はこれらの面倒を見てくれますので、これらを使うのがお勧めです。

#7. While updating, the fdtable pointer must be looked up while
#   holding files->file_lock. If ->file_lock is dropped, then
#   another thread expand the files thereby creating a new
#   fdtable and making the earlier fdtable pointer stale.
#   For example :
7. 更新時には、fdtable ポインタは files->file_lock を保持した状態で参照
   する必要があります。->file_lock を手放した場合、他のスレッドが
   そのファイルを拡張して新しい fdtable を作成し、以前の fdtable ポインタ
   を無効な状態にしてしまう可能性があります。
   以下に例を示します。

	spin_lock(&files->file_lock);
	fd = locate_fd(files, file, start);
	if (fd >= 0) {
		/* locate_fd() may have expanded fdtable, load the ptr */
		fdt = files_fdtable(files);
		__set_open_fd(fd, fdt);
		__clear_close_on_exec(fd, fdt);
		spin_unlock(&files->file_lock);
	.....

#   Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
#   the fdtable pointer (fdt) must be loaded after locate_fd().
   locate_fd() は ->file_lock を手放す (その後 ->file_lock を再取得
   する) 可能性があるため、fdtable ポインタ (fdt) は locate_fd() の
   実行後にロードする必要があります。

