XV6學習(12)Lab lock: Parallelism/locking

程式碼在github

這一次實驗是要對XV6內部的鎖進行優化,減少鎖爭用,提高系統的性能。

Memory allocator (moderate)

第一個實驗是對XV6內核的記憶體頁面分配器進行改進,改進的策略在前面的章節中也講過了。XV6原本是使用一個空閑頁面鏈表,但是這樣就會導致不同CPU上的kallockfree會產生鎖爭用,記憶體頁面的分配被完全串列化了,降低了系統的性能。

而一個改進策略就是為每個CPU核心分配一個空閑鏈表,kallockfree都在本核心的鏈表上進行,只有噹噹前核心的鏈表為空時才去訪問其他核心的鏈表。通過這種策略就可以減少鎖的爭用,只有當某核心的鏈表為空時才會發生鎖爭用。

首先定義NCPU個kmem結構體,並在kinit函數中對鎖進行初始化。

struct {
  struct spinlock lock;
  struct run *freelist;
  char lock_name[7];
} kmem[NCPU];

void
kinit()
{
  for (int i = 0; i < NCPU; i++) {
    snprintf(kmem[i].lock_name, sizeof(kmem[i].lock_name), "kmem_%d", i);
    initlock(&kmem[i].lock, kmem[i].lock_name);
  }
  freerange(end, (void*)PHYSTOP);
}

對於kfree函數只需要將釋放的頁面插入到當前核心對應鏈表上就行了

void
kfree(void *pa)
{
  ...
  r = (struct run*)pa;

  push_off();
  int id = cpuid();

  acquire(&kmem[id].lock);
  r->next = kmem[id].freelist;
  kmem[id].freelist = r;
  release(&kmem[id].lock);

  pop_off();
}

對於kalloc函數,當在當前核心上申請失敗時,就嘗試從其他核心上獲取頁面。使用快慢指針來找到鏈表的中點,之後將一半的頁面移動到當前核心的鏈表上。

void *
kalloc(void)
{
  struct run *r;

  push_off();
  int id = cpuid();

  acquire(&kmem[id].lock);
  r = kmem[id].freelist;
  if(r) {
    kmem[id].freelist = r->next;
  }
  else {
    // alloc failed, try to steal from other cpu
    int success = 0;
    int i = 0;
    for(i = 0; i < NCPU; i++) {
      if (i == id) continue;
      acquire(&kmem[i].lock);
      struct run *p = kmem[i].freelist;
      if(p) {
        // steal half of memory
        struct run *fp = p; // faster pointer
        struct run *pre = p;
        while (fp && fp->next) {
          fp = fp->next->next;
          pre = p;
          p = p->next;
        }
        kmem[id].freelist = kmem[i].freelist;
        if (p == kmem[i].freelist) {
          // only have one page
          kmem[i].freelist = 0;
        }
        else {
          kmem[i].freelist = p;
          pre->next = 0;
        }
        success = 1;
      }
      release(&kmem[i].lock);
      if (success) {
        r = kmem[id].freelist;
        kmem[id].freelist = r->next;
        break;
      }
    }
  }
  release(&kmem[id].lock);
  pop_off();

  if(r)
    memset((char*)r, 5, PGSIZE); // fill with junk
  return (void*)r;
}

實驗結果如下:

$ kalloctest
start test1
test1 results:
--- lock kmem/bcache stats
lock: kmem_0: #fetch-and-add 0 #acquire() 77186
lock: kmem_1: #fetch-and-add 0 #acquire() 182362
lock: kmem_2: #fetch-and-add 0 #acquire() 173534
lock: bcache_bucket: #fetch-and-add 0 #acquire() 128
lock: bcache_bucket: #fetch-and-add 0 #acquire() 138
lock: bcache_bucket: #fetch-and-add 0 #acquire() 142
lock: bcache_bucket: #fetch-and-add 0 #acquire() 148
lock: bcache_bucket: #fetch-and-add 0 #acquire() 132
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6
lock: bcache_bucket: #fetch-and-add 0 #acquire() 42
lock: bcache_bucket: #fetch-and-add 0 #acquire() 34
lock: bcache_bucket: #fetch-and-add 0 #acquire() 5916
lock: bcache_bucket: #fetch-and-add 0 #acquire() 32
lock: bcache_bucket: #fetch-and-add 0 #acquire() 242
lock: bcache_bucket: #fetch-and-add 0 #acquire() 128
lock: bcache_bucket: #fetch-and-add 0 #acquire() 128
--- top 5 contended locks:
lock: proc: #fetch-and-add 31954 #acquire() 206502
lock: proc: #fetch-and-add 24395 #acquire() 206518
lock: proc: #fetch-and-add 9306 #acquire() 206501
lock: proc: #fetch-and-add 7463 #acquire() 206481
lock: proc: #fetch-and-add 5209 #acquire() 206480
tot= 0
test1 OK
start test2
total free number of pages: 32493 (out of 32768)
.....
test2 OK

Buffer cache (hard)

這個實驗是要對XV6的磁碟緩衝區進行優化。在初始的XV6磁碟緩衝區中是使用一個LRU鏈表來維護的,而這就導致了每次獲取、釋放緩衝區時就要對整個鏈表加鎖,也就是說緩衝區的操作是完全串列進行的。

為了提高並行性能,我們可以用哈希表來代替鏈表,這樣每次獲取和釋放的時候,都只需要對哈希表的一個桶進行加鎖,桶之間的操作就可以並行進行。只有當需要對緩衝區進行驅逐替換時,才需要對整個哈希表加鎖來查找要替換的塊。

使用哈希表就不能使用鏈表來維護LRU資訊,因此需要在buf結構體中添加timestamp域來記錄釋放的事件,同時prev域也不再需要。

struct buf {
  int valid;   // has data been read from disk?
  int disk;    // does disk "own" buf?
  uint dev;
  uint blockno;
  struct sleeplock lock;
  uint refcnt;
  // struct buf *prev; // LRU cache list
  struct buf *next;
  uchar data[BSIZE];

  uint timestamp;
};

brelse函數中對timestamp域進行維護,同時將鏈表的鎖替換為桶級鎖:

void
brelse(struct buf *b)
{
  if(!holdingsleep(&b->lock))
    panic("brelse");

  releasesleep(&b->lock);

  int idx = hash(b->dev, b->blockno);

  acquire(&hashtable[idx].lock);
  b->refcnt--;
  if (b->refcnt == 0) {
    // no one is waiting for it.
    b->timestamp = ticks;
  }
  
  release(&hashtable[idx].lock);
}

定義哈希表的結構體,bcache.lock為表級鎖,而hashtable[i].lock為桶級鎖:

#define NBUCKET 13
#define NBUF (NBUCKET * 3)

struct {
  struct spinlock lock;
  struct buf buf[NBUF];
} bcache;

struct bucket {
  struct spinlock lock;
  struct buf head;
}hashtable[NBUCKET];

int
hash(uint dev, uint blockno)
{
  return blockno % NBUCKET;
}

binit函數中對哈希表進行初始化,將bcache.buf[NBUF]中的塊平均分配給每個桶,記得設置b->blockno使塊的hash與桶相對應,後面需要根據塊來查找對應的桶。

void
binit(void)
{
  struct buf *b;

  initlock(&bcache.lock, "bcache");

  for(b = bcache.buf; b < bcache.buf+NBUF; b++){
    initsleeplock(&b->lock, "buffer");
  }

  b = bcache.buf;
  for (int i = 0; i < NBUCKET; i++) {
    initlock(&hashtable[i].lock, "bcache_bucket");
    for (int j = 0; j < NBUF / NBUCKET; j++) {
      b->blockno = i; // hash(b) should equal to i
      b->next = hashtable[i].head.next;
      hashtable[i].head.next = b;
      b++;
    }
  }
}

之後就是重點bget函數,首先在對應的桶當中查找當前塊是否被快取,如果被快取就直接返回;如果沒有被快取的話,就需要查找一個塊並將其逐出替換。我這裡使用的策略是先在當前桶當中查找,當前桶沒有查找到再去全局數組中查找,這樣的話如果當前桶中有空閑塊就可以避免全局鎖。

在全局數組中查找時,要先加上表級鎖,當找到一個塊之後,就可以根據塊的資訊查找到對應的桶,之後再對該桶加鎖,將塊從桶的鏈表上取下來,釋放鎖,最後再加到當前桶的鏈表上去。

這裡有個小問題就是全局數組中找到一個塊之後,到對該桶加上鎖之間有一個窗口,可能就在這個窗口裡面這個塊就被那個桶對應的本地查找階段用掉了。因此,需要在加上鎖之後判斷是否被用了,如果被用了就要重新查找。

static struct buf*
bget(uint dev, uint blockno)
{
  // printf("dev: %d blockno: %d Status: ", dev, blockno);
  struct buf *b;

  int idx = hash(dev, blockno);
  struct bucket* bucket = hashtable + idx;
  acquire(&bucket->lock);

  // Is the block already cached?
  for(b = bucket->head.next; b != 0; b = b->next){
    if(b->dev == dev && b->blockno == blockno){
      b->refcnt++;
      release(&bucket->lock);
      acquiresleep(&b->lock);
      // printf("Cached %p\n", b);
      return b;
    }
  }

  // Not cached.
  // First try to find in current bucket.
  int min_time = 0x8fffffff;
  struct buf* replace_buf = 0;

  for(b = bucket->head.next; b != 0; b = b->next){
    if(b->refcnt == 0 && b->timestamp < min_time) {
      replace_buf = b;
      min_time = b->timestamp;
    }
  }
  if(replace_buf) {
    // printf("Local %d %p\n", idx, replace_buf);
    goto find;
  }

  // Try to find in other bucket.
  acquire(&bcache.lock);
  refind:
  for(b = bcache.buf; b < bcache.buf + NBUF; b++) {
    if(b->refcnt == 0 && b->timestamp < min_time) {
      replace_buf = b;
      min_time = b->timestamp;
    }
  }
  if (replace_buf) {
    // remove from old bucket
    int ridx = hash(replace_buf->dev, replace_buf->blockno);
    acquire(&hashtable[ridx].lock);
    if(replace_buf->refcnt != 0)  // be used in another bucket's local find between finded and acquire
    {
      release(&hashtable[ridx].lock);
      goto refind;
    }
    struct buf *pre = &hashtable[ridx].head;
    struct buf *p = hashtable[ridx].head.next;
    while (p != replace_buf) {
      pre = pre->next;
      p = p->next;
    }
    pre->next = p->next;
    release(&hashtable[ridx].lock);
    // add to current bucket
    replace_buf->next = hashtable[idx].head.next;
    hashtable[idx].head.next = replace_buf;
    release(&bcache.lock);
    // printf("Global %d -> %d %p\n", ridx, idx, replace_buf);
    goto find;
  }
  else {
    panic("bget: no buffers");
  }

  find:
  replace_buf->dev = dev;
  replace_buf->blockno = blockno;
  replace_buf->valid = 0;
  replace_buf->refcnt = 1;
  release(&bucket->lock);
  acquiresleep(&replace_buf->lock);
  return replace_buf;
}

最後將bpinbunpin的鎖替換為桶級鎖就行了:

void
bpin(struct buf *b) {
  int idx = hash(b->dev, b->blockno);
  acquire(&hashtable[idx].lock);
  b->refcnt++;
  release(&hashtable[idx].lock);
}

void
bunpin(struct buf *b) {
  int idx = hash(b->dev, b->blockno);
  acquire(&hashtable[idx].lock);
  b->refcnt--;
  release(&hashtable[idx].lock);
}

實驗結果如下:

start test0
test0 results:
--- lock kmem/bcache stats
...
lock: bcache_bucket: #fetch-and-add 0 #acquire() 4244
lock: bcache_bucket: #fetch-and-add 0 #acquire() 2246
lock: bcache_bucket: #fetch-and-add 0 #acquire() 4402
lock: bcache_bucket: #fetch-and-add 0 #acquire() 4458
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6450
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6312
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6624
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6634
lock: bcache_bucket: #fetch-and-add 0 #acquire() 12706
lock: bcache_bucket: #fetch-and-add 0 #acquire() 6208
lock: bcache_bucket: #fetch-and-add 0 #acquire() 4360
lock: bcache_bucket: #fetch-and-add 0 #acquire() 4246
lock: bcache_bucket: #fetch-and-add 0 #acquire() 2236
--- top 5 contended locks:
lock: proc: #fetch-and-add 269741 #acquire() 4551132
lock: proc: #fetch-and-add 236112 #acquire() 4551131
lock: proc: #fetch-and-add 186278 #acquire() 4551151
lock: proc: #fetch-and-add 167286 #acquire() 4551164
lock: proc: #fetch-and-add 151922 #acquire() 4551132
tot= 0
test0: OK
start test1
test1 OK