Linux核心的namespace機制分析

2020-06-16 18:09:19

1. Linux核心namespace機制

Linux Namespaces機制提供一種資源隔離方案。PID,IPC,Network等系統資源不再是全域性性的，而是屬於某個特定的Namespace。每個namespace下的資源對於其他namespace下的資源都是透明，不可見的。因此在作業系統層面上看，就會出現多個相同pid的進程。系統中可以同時存在兩個進程號為0,1,2的進程，由於屬於不同的namespace，所以它們之間並不衝突。而在使用者層面上只能看到屬於使用者自己namespace下的資源，例如使用ps命令只能列出自己namespace下的進程。這樣每個namespace看上去就像一個單獨的Linux系統。

2 . Linux核心中namespace結構體

在Linux核心中提供了多個namespace，其中包括fs (mount), uts, network, sysvipc 等。一個進程可以屬於多個namespace，既然namespace和進程相關，那麼在task_struct結構體中就會包含和namespace相關聯的變數。在task_struct 結構中有一個指向namespace結構體的指標nsproxy。

/*
 * Excerpt of struct task_struct: only the namespace-related member is
 * shown here; the remaining members of the structure are elided.
 */
struct task_struct {
	/* ... other members elided ... */

	/* namespaces */
	struct nsproxy *nsproxy;

	/* ... other members elided ... */
};

再看一下nsproxy是如何定義的，在include/linux/nsproxy.h檔案中，這裡一共定義了5個各自的名稱空間結構體，在該結構體中定義了5個指向各個型別namespace的指標，由於多個進程可以使用同一個namespace，所以nsproxy可以共用使用，count欄位是該結構的參照計數。

/* 'count' is the number of tasks holding a reference.

* The count for each namespace, then, will be the number

* of nsproxies pointing to it, not the number of tasks.

* The nsproxy is shared by tasks which share all namespaces.

* As soon as a single namespace is cloned or unshared, the

* nsproxy is copied

struct nsproxy {

atomic_t count;

struct uts_namespace *uts_ns;

struct ipc_namespace *ipc_ns;

struct mnt_namespace *mnt_ns;

struct pid_namespace *pid_ns_for_children;

struct net *net_ns;

};

(1) UTS名稱空間包含了執行核心的名稱、版本、底層體系結構型別等資訊。UTS是UNIX Timesharing System的簡稱。

(2) 儲存在struct ipc_namespace中的所有與進程間通訊（IPC）有關的資訊。

(3) 已經裝載的檔案系統的檢視，在struct mnt_namespace中給出。

(4) 有關進程ID的資訊，由struct pid_namespace提供。

(5) struct net（由net_ns指標指向）包含所有網路相關的名稱空間引數。

系統中有一個預設的nsproxy，init_nsproxy，該結構在task初始化時也會被初始化。

#define INIT_TASK(tsk) \
{ \
	.nsproxy = &init_nsproxy, \
}

其中init_nsproxy的定義為：

/*
 * Only declared here, never assigned in this excerpt.
 * NOTE(review): presumably the cache nsproxy objects are allocated
 * from - confirm against the rest of the file.
 */
static struct kmem_cache *nsproxy_cachep;

/*
 * The default nsproxy.  It starts with a reference count of 1 and points
 * at the init_* namespace objects; mnt_ns is explicitly set to NULL.
 * The IPC and network members are only initialised when the matching
 * configuration options are enabled.
 */
struct nsproxy init_nsproxy = {
	.count			= ATOMIC_INIT(1),
	.uts_ns			= &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
	.ipc_ns			= &init_ipc_ns,
#endif
	.mnt_ns			= NULL,
	.pid_ns_for_children	= &init_pid_ns,
#if defined(CONFIG_NET)
	.net_ns			= &init_net,
#endif
};

其中 .mnt_ns 被設為NULL，並沒有指向預設的namespace；其餘的namespace都進行了系統預設初始化。

3. 使用clone建立自己的Namespace

如果要建立自己的名稱空間，可以使用系統呼叫clone(),它在使用者空間的原型為

int clone(int (*fn)(void *), void *child_stack, int flags, void *arg)

這裡fn是函數指標，指向子進程要執行的函數；child_stack是為子進程分配的系統堆疊空間；flags是標誌，用來描述需要從父進程繼承哪些資源；arg是傳給子進程的引數，也就是fn所指向函數的引數。下面是flags可以取的值，這裡只關心和namespace相關的引數。

CLONE_FS 子進程與父進程共用相同的檔案系統，包括root、當前目錄、umask

CLONE_NEWNS 當clone需要自己的名稱空間時設定這個標誌，不能同時設定CLONE_NEWNS和CLONE_FS。

clone()函數是在libc庫中定義的一個封裝函數，它負責建立新輕量級進程的堆疊，並且呼叫對程式設計者隱藏的clone系統呼叫。實現clone()系統呼叫的sys_clone()服務例程並沒有fn和arg引數。封裝函數把fn指標存放在子進程堆疊的某個位置處，該位置就是該封裝函數本身返回地址存放的位置。arg指標正好存放在子進程堆疊中fn的下面。當封裝函數結束時，CPU從堆疊中取出返回地址，然後執行fn(arg)函數。

/*
 * Prototype for the glibc wrapper function.
 *
 * fn          - function the child executes, called as fn(arg)
 * child_stack - stack space set up for the child
 * flags       - describes which resources are inherited from the parent
 * arg         - argument handed to fn
 *
 * The optional trailing arguments are listed in the comment inside the
 * parameter list.
 */

#include <sched.h>

int clone(int (*fn)(void *), void *child_stack,

int flags, void *arg, ...

/* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ );

/*
 * Prototype for the raw system call.  Unlike the wrapper above it takes
 * no fn and no arg parameter.
 */

long clone(unsigned long flags, void *child_stack,

void *ptid, void *ctid,

struct pt_regs *regs);

我們在使用者空間呼叫的clone()函數，是經過libc庫封裝過的；在Linux核心的fork.c檔案中，則有下面的定義，最終呼叫的都是do_fork()函數。

/*
 * clone system call entry point, only built when __ARCH_WANT_SYS_CLONE
 * is defined.  The order (and number) of the system call arguments is
 * selected by the CONFIG_CLONE_BACKWARDS* options; every variant shares
 * the single body below, which forwards to do_fork().
 */
#ifdef __ARCH_WANT_SYS_CLONE

#ifdef CONFIG_CLONE_BACKWARDS

/* Variant 1: tls_val comes before child_tidptr. */
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,

int __user *, parent_tidptr,

int, tls_val,

int __user *, child_tidptr)

#elif defined(CONFIG_CLONE_BACKWARDS2)

/* Variant 2: newsp comes before clone_flags. */
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,

int __user *, parent_tidptr,

int __user *, child_tidptr,

int, tls_val)

#elif defined(CONFIG_CLONE_BACKWARDS3)

/* Variant 3: six arguments, with an extra stack_size after newsp. */
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,

int, stack_size,

int __user *, parent_tidptr,

int __user *, child_tidptr,

int, tls_val)

#else

/* Default order: flags, stack pointer, parent tid, child tid, tls. */
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,

int __user *, parent_tidptr,

int __user *, child_tidptr,

int, tls_val)

#endif

{

/*
 * The stack size handed to do_fork() is always 0 and tls_val is not
 * forwarded, in every variant (including variant 3, whose stack_size
 * argument is not used here).
 */
return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);

}

#endif

3.1 do_fork函數

在clone()系統呼叫中呼叫do_fork函數進行真正的處理，在do_fork函數中呼叫copy_process進行處理。

/*
 * do_fork() - create a new task with copy_process() and wake it up.
 *
 * @clone_flags:   flag bits; CLONE_UNTRACED, CLONE_VFORK, the CSIGNAL
 *                 mask and CLONE_PARENT_SETTID are examined here, and the
 *                 whole value is passed on to copy_process()
 * @stack_start:   passed through to copy_process()
 * @stack_size:    passed through to copy_process()
 * @parent_tidptr: user pointer that receives the new id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  passed through to copy_process()
 *
 * Returns the value of pid_vnr() for the new task's pid on success, or
 * PTR_ERR() of the copy_process() result on failure.
 */
long do_fork(unsigned long clone_flags,
	     unsigned long stack_start,
	     unsigned long stack_size,
	     int __user *parent_tidptr,
	     int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		/* Drop the event again if it is not enabled for current. */
		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

		/* Take the pid reference now; it is dropped by put_pid() below. */
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* Set up the vfork completion before the child can run. */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

Linux核心的namespace機制分析

熱門文章