// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * dynamic hugetlb core file
 */

#include <linux/rmap.h>
#include <linux/migrate.h>
#include <linux/memblock.h>
#include <linux/memory_hotplug.h>
#include <linux/swap.h>
#include <linux/dynamic_hugetlb.h>

#include "internal.h"

#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__)
#define CREATE_TRACE_POINTS
#include <trace/events/dynamic_hugetlb.h>
#endif

/* Set to true by the "dynamic_hugetlb=on" early parameter; checked by dynamic_hugetlb_init(). */
bool enable_dhugetlb = false;
/* Static key that dynamic_hugetlb_init() enables once the page list has been allocated. */
DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);

/*
 * Take every percpu pool lock, from index 0 upwards, to prevent any page
 * allocation from the percpu pools.
 *
 * hpool->lock must already be released when this is called.
 *
 * NOTE(review): the pool index is passed as the lockdep subclass. Lockdep
 * only supports a small, fixed number of subclasses - confirm that
 * NR_PERCPU_POOL cannot exceed it on lockdep-enabled builds.
 */
static inline void dhugetlb_percpu_pool_lock_all(struct dhugetlb_pool *hpool)
{
	int idx = 0;

	while (idx < NR_PERCPU_POOL) {
		spin_lock_nested(&hpool->percpu_pool[idx].lock, idx);
		idx++;
	}
}

/* Release every percpu pool lock, from the highest index down to index 0. */
static inline void dhugetlb_percpu_pool_unlock_all(struct dhugetlb_pool *hpool)
{
	int idx = NR_PERCPU_POOL;

	while (idx-- > 0)
		spin_unlock(&hpool->percpu_pool[idx].lock);
}

/*
 * Lock everything before reading or writing the percpu pools.
 *
 * Each percpu_pool lock blocks pages being allocated/freed by others.
 * The hpool lock blocks pages being allocated/freed by the percpu pools.
 *
 * Everything needs to be locked in the following situations:
 * a) when merging pages, to make sure no one can allocate a page from
 *    any pool;
 * b) when an accurate page count is needed.
 *
 * hpool->lock and all percpu_pool locks must be released before calling
 * this. Local interrupts are disabled first, then all percpu pool locks are
 * taken, and hpool->lock is taken last.
 */
static inline void dhugetlb_lock_all(struct dhugetlb_pool *hpool)
{
	local_irq_disable();
	dhugetlb_percpu_pool_lock_all(hpool);
	spin_lock(&hpool->lock);
}

/*
 * Release what dhugetlb_lock_all() took, in the opposite order: hpool->lock
 * first, then every percpu pool lock, then re-enable local interrupts.
 * hpool->lock must be held on entry.
 */
static inline void dhugetlb_unlock_all(struct dhugetlb_pool *hpool)
{
	lockdep_assert_held(&hpool->lock);

	spin_unlock(&hpool->lock);
	dhugetlb_percpu_pool_unlock_all(hpool);
	local_irq_enable();
}

/* Index of the PUD-sized region containing @pfn; used to index dhugetlb_pagelist_t->hpool[]. */
#define hugepage_index(pfn)	((pfn) >> (PUD_SHIFT - PAGE_SHIFT))
/*
 * Append @page to the free list of pool @hpages_pool_idx and account it as a
 * free normal page.
 *
 * For the 1G and 2M pools the page is first prepared as a compound page
 * (gigantic or PMD order respectively), its refcount is set to 0, the
 * hugetlb destructor is installed and subpool/cgroup pointers are cleared.
 * Pages going to any other pool only get their list head and mapping reset.
 *
 * Caller must hold hpool->lock.
 */
static void add_new_page_to_pool(struct dhugetlb_pool *hpool, struct page *page, int hpages_pool_idx)
{
	struct huge_pages_pool *pool = &hpool->hpages_pool[hpages_pool_idx];
	bool is_huge = hpages_pool_idx == HUGE_PAGES_POOL_1G ||
		       hpages_pool_idx == HUGE_PAGES_POOL_2M;

	lockdep_assert_held(&hpool->lock);
	VM_BUG_ON_PAGE(page_mapcount(page), page);
	INIT_LIST_HEAD(&page->lru);

	/* Size-specific compound page preparation. */
	if (hpages_pool_idx == HUGE_PAGES_POOL_1G)
		prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT);
	else if (hpages_pool_idx == HUGE_PAGES_POOL_2M)
		prep_new_page(page, PMD_SHIFT - PAGE_SHIFT, __GFP_COMP, 0);

	/* Common hugetlb bookkeeping shared by both huge page sizes. */
	if (is_huge) {
		set_page_count(page, 0);
		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
		hugetlb_set_page_subpool(page, NULL);
		set_hugetlb_cgroup(page, NULL);
		set_hugetlb_cgroup_rsvd(page, NULL);
	}

	page->mapping = NULL;
	list_add_tail(&page->lru, &pool->hugepage_freelists);
	pool->free_normal_pages++;
}

/*
 * Break the PUD-sized compound page @page apart and add each PMD-sized chunk
 * of it to the 2M pool. Caller must hold hpool->lock.
 */
static void __hpool_split_gigantic_page(struct dhugetlb_pool *hpool, struct page *page)
{
	int nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT);
	int nr_blocks = 1 << (PMD_SHIFT - PAGE_SHIFT);
	unsigned long pfn = page_to_pfn(page);
	int i;

	lockdep_assert_held(&hpool->lock);
	/* Reset the compound map/pin counters of the gigantic page to 0. */
	atomic_set(compound_mapcount_ptr(page), 0);
	atomic_set(compound_pincount_ptr(page), 0);

	/* Undo the compound structure: every tail page first, then the head. */
	for (i = 1; i < nr_pages; i++)
		clear_compound_head(pfn_to_page(pfn + i));
	set_compound_order(page, 0);
	page[1].compound_nr = 0;
	__ClearPageHead(page);

	/*
	 * Despite its name, nr_blocks is the number of base pages in one
	 * PMD-sized chunk, i.e. the stride between chunk head pages.
	 */
	for (i = 0; i < nr_pages; i+= nr_blocks)
		add_new_page_to_pool(hpool, pfn_to_page(pfn + i),
				     HUGE_PAGES_POOL_2M);
}

/*
 * Break the PMD-sized compound page @page apart and add every base page of
 * it to the 4K pool. Caller must hold hpool->lock.
 */
static void __hpool_split_huge_page(struct dhugetlb_pool *hpool, struct page *page)
{
	int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
	int i;

	lockdep_assert_held(&hpool->lock);
	/* Drop the hugetlb destructor and compound order from the head page. */
	set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
	set_compound_order(page, 0);

	__ClearPageHead(page);
	for (i = 0; i < nr_pages; i++) {
		/* Index 0 is the former head page and has no compound_head link. */
		if (i != 0)
			clear_compound_head(&page[i]);
		/*
		 * If a hugepage is mapped in private mode, the PG_uptodate bit
		 * will not be cleared when the hugepage is freed. Clear the
		 * page state using free_pages_prepare() here.
		 */
		free_pages_prepare(&page[i], 0, false);
		add_new_page_to_pool(hpool, &page[i], HUGE_PAGES_POOL_4K);
	}
}

/*
 * Split one free page of pool @hpages_pool_idx into pages of the next smaller
 * pool and record the split so that it can be merged back later.
 *
 * When the pool has no free page, one page of the next larger pool is split
 * first (recursively). Caller must hold hpool->lock.
 *
 * Return: 0 on success, -EINVAL for an index that cannot be split, -ENOMEM
 * when no page is available or the split record cannot be allocated.
 */
static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx)
{
	struct huge_pages_pool *pool;
	struct split_hugepage *record;
	struct page *victim;

	lockdep_assert_held(&hpool->lock);

	if (hpages_pool_idx < 0 || hpages_pool_idx >= HUGE_PAGES_POOL_MAX - 1)
		return -EINVAL;

	pool = &hpool->hpages_pool[hpages_pool_idx];

	/* Nothing to split here: try to refill from the next larger pool. */
	if (!pool->free_normal_pages) {
		if (hpool_split_page(hpool, hpages_pool_idx - 1))
			return -ENOMEM;
	}

	/* hpool->lock is a spinlock, so the allocation must not sleep. */
	record = kzalloc(sizeof(*record), GFP_ATOMIC);
	if (!record)
		return -ENOMEM;

	/* Pages to split are taken from the tail of the free list. */
	victim = list_entry(pool->hugepage_freelists.prev, struct page, lru);
	list_del(&victim->lru);
	pool->free_normal_pages--;

	record->start_pfn = page_to_pfn(victim);
	list_add(&record->head_pages, &pool->hugepage_splitlists);
	pool->split_normal_pages++;
	trace_dynamic_hugetlb_split_merge(hpool, victim, DHUGETLB_SPLIT, page_size(victim));

	if (hpages_pool_idx == HUGE_PAGES_POOL_1G)
		__hpool_split_gigantic_page(hpool, victim);
	else if (hpages_pool_idx == HUGE_PAGES_POOL_2M)
		__hpool_split_huge_page(hpool, victim);

	return 0;
}

/*
 * Move up to @nr_pages free 4K pages from @hpool to @percpu_pool, splitting
 * 2M pages first when the 4K pool does not hold enough.
 *
 * Return: 0 when the percpu pool ends up with at least one free page,
 * -ENOMEM otherwise.
 */
static int add_pages_to_percpu_pool(struct dhugetlb_pool *hpool,
				    struct percpu_pages_pool *percpu_pool,
				    unsigned long nr_pages)
{
	struct huge_pages_pool *pool_4k = &hpool->hpages_pool[HUGE_PAGES_POOL_4K];
	struct page *cur, *tmp;
	int moved = 0;

	/* Keep splitting until the request can be covered or splitting fails. */
	while (pool_4k->free_normal_pages < nr_pages) {
		if (hpool_split_page(hpool, HUGE_PAGES_POOL_2M))
			break;
	}

	list_for_each_entry_safe(cur, tmp, &pool_4k->hugepage_freelists, lru) {
		list_del(&cur->lru);
		pool_4k->free_normal_pages--;
		list_add_tail(&cur->lru, &percpu_pool->head_page);
		percpu_pool->free_pages++;
		moved++;
		if (moved == nr_pages)
			break;
	}

	return percpu_pool->free_pages ? 0 : -ENOMEM;
}

/*
 * Move up to @nr_pages free pages from @percpu_pool back to the head of the
 * 4K free list of @hpool, updating both free counters.
 */
static void reclaim_pages_from_percpu_pool(struct dhugetlb_pool *hpool,
					struct percpu_pages_pool *percpu_pool,
					unsigned long nr_pages)
{
	struct huge_pages_pool *pool_4k = &hpool->hpages_pool[HUGE_PAGES_POOL_4K];
	struct page *cur, *tmp;
	int moved = 0;

	list_for_each_entry_safe(cur, tmp, &percpu_pool->head_page, lru) {
		list_del(&cur->lru);
		percpu_pool->free_pages--;
		list_add(&cur->lru, &pool_4k->hugepage_freelists);
		pool_4k->free_normal_pages++;
		moved++;
		if (moved == nr_pages)
			break;
	}
}

/*
 * Try to merge the pieces of one previously split page back into a single
 * page of pool @hpages_pool_idx.
 *
 * The split records of the pool are walked until one is found whose pieces
 * are all free (none has PagePool set). For the 2M pool with @force_merge
 * set, pieces that are still in use are migrated away once per split record
 * before the record is given up.
 *
 * hpool->lock must be held on entry and is held again on return, but it is
 * dropped and re-taken for every split record that is examined. When
 * merging into the 2M pool all percpu pool locks are taken as well while the
 * pieces are checked, and the percpu pools are drained into the 4K pool.
 *
 * NOTE(review): split_page and the iteration cursor split_next are used
 * after hpool->lock was dropped - confirm that callers serialize against
 * concurrent split/merge on the same pool.
 *
 * Return: 0 after the first successful merge, -EINVAL for an index that
 * cannot be merged into, -ENOMEM when there is no split record or none of
 * them could be merged.
 */
static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx, bool force_merge)
{
	struct huge_pages_pool *hpages_pool, *src_hpages_pool;
	struct split_hugepage *split_page, *split_next;
	unsigned long nr_pages, block_size;
	struct page *page, *next, *p;
	struct percpu_pages_pool *percpu_pool;
	bool need_migrate = false, need_initial = false;
	bool tried_migrate, can_merge;
	int i;
	LIST_HEAD(wait_page_list);

	lockdep_assert_held(&hpool->lock);

	if (hpages_pool_idx < 0 || hpages_pool_idx >= HUGE_PAGES_POOL_MAX - 1)
		return -EINVAL;

	/*
	 * nr_pages: base pages covered by the merged page.
	 * block_size: base pages covered by one piece in the source pool.
	 */
	switch (hpages_pool_idx) {
		case HUGE_PAGES_POOL_1G:
			nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT);
			block_size = 1 << (PMD_SHIFT - PAGE_SHIFT);
			need_initial = true;
			break;
		case HUGE_PAGES_POOL_2M:
			nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT);
			block_size = 1;
			/* Migration is only ever attempted for 4K -> 2M merges. */
			need_migrate |= force_merge;
			break;
	}

	hpages_pool = &hpool->hpages_pool[hpages_pool_idx];
	src_hpages_pool = &hpool->hpages_pool[hpages_pool_idx + 1];
	if (!hpages_pool->split_normal_pages)
		return -ENOMEM;

	list_for_each_entry_safe(split_page, split_next, &hpages_pool->hugepage_splitlists, head_pages) {
		tried_migrate = false;

merge:
		can_merge = true;

		/* Give other users of hpool->lock a chance between records. */
		spin_unlock_irq(&hpool->lock);
		cond_resched();
		/*
		 * If we are merging 4K page to 2M page, we need to get
		 * lock of percpu pool sequentially and clear percpu pool.
		 */
		if (hpages_pool_idx == HUGE_PAGES_POOL_2M) {
			dhugetlb_lock_all(hpool);
			for (i = 0; i < NR_PERCPU_POOL; i++) {
				percpu_pool = &hpool->percpu_pool[i];
				reclaim_pages_from_percpu_pool(hpool, percpu_pool,
							       percpu_pool->free_pages);
			}
		} else {
			spin_lock_irq(&hpool->lock);
		}

		page = pfn_to_page(split_page->start_pfn);
		for (i = 0; i < nr_pages; i+= block_size) {
			p = pfn_to_page(split_page->start_pfn + i);
			if (PagePool(p)) {
				/*
				 * Some pages still in use, can't merge.
				 * If don't need migration or have tried,
				 * then skip merging these pages.
				 */
				can_merge = false;
				if (!need_migrate || tried_migrate)
					break;
				else
					goto migrate;
			}
		}
		if (hpages_pool_idx == HUGE_PAGES_POOL_2M)
			/*
			 * All target 4K page are in src_hpages_pool, we
			 * can unlock percpu pool.
			 */
			dhugetlb_percpu_pool_unlock_all(hpool);

		if (!can_merge)
			continue;

		/* Every piece is free: retire the split record and the pieces. */
		list_del(&split_page->head_pages);
		hpages_pool->split_normal_pages--;
		for (i = 0; i < nr_pages; i+= block_size) {
			p = pfn_to_page(split_page->start_pfn + i);
			list_del(&p->lru);
			src_hpages_pool->free_normal_pages--;
			/*
			 * The input of prep_compound_gigantic_page should be a
			 * group of pages whose ref count is 1 rather than
			 * compound_page.
			 * Initialize the pages before merge them to 1G.
			 */
			if (need_initial) {
				int j;

				set_compound_page_dtor(p, NULL_COMPOUND_DTOR);
				atomic_set(compound_mapcount_ptr(p), 0);
				set_compound_order(p, 0);
				__ClearPageHead(p);
				set_page_count(p, 1);
				for (j = 1; j < block_size; j++) {
					clear_compound_head(&p[j]);
					set_page_count(&p[j], 1);
				}
			}
		}
		kfree(split_page);
		add_new_page_to_pool(hpool, page, hpages_pool_idx);
		trace_dynamic_hugetlb_split_merge(hpool, page, DHUGETLB_MERGE, page_size(page));
		return 0;
migrate:
		/*
		 * Only reached from the 4K -> 2M check above, i.e. with
		 * hpool->lock and every percpu pool lock held.
		 */
		tried_migrate = true;

		/* Isolate free page first. */
		INIT_LIST_HEAD(&wait_page_list);
		for (i = 0; i < nr_pages; i+= block_size) {
			p = pfn_to_page(split_page->start_pfn + i);
			if (!PagePool(p)) {
				list_move(&p->lru, &wait_page_list);
				src_hpages_pool->free_normal_pages--;
			}
		}

		/* Unlock and try migration. */
		dhugetlb_unlock_all(hpool);

		for (i = 0; i < nr_pages; i+= block_size) {
			p = pfn_to_page(split_page->start_pfn + i);
			if (PagePool(p)) {
				cond_resched();
				lru_add_drain_all();
				/*
				 * TODO: fatal migration failures should bail
				 * out
				 */
				do_migrate_range(page_to_pfn(p), page_to_pfn(p) + block_size);
			}
		}
		spin_lock_irq(&hpool->lock);

		/*
		 * Move all isolate pages to src_hpages_pool and then try
		 * merge again.
		 */
		list_for_each_entry_safe(page, next, &wait_page_list, lru) {
			list_move_tail(&page->lru, &src_hpages_pool->hugepage_freelists);
			src_hpages_pool->free_normal_pages++;
		}
		goto merge;
	}
	return -ENOMEM;
}

/*
 * Merge every split page of @hpool back, first 4K pages into 2M pages and
 * then 2M pages into 1G pages, and turn all reserved huge pages of each size
 * back into free normal pages.
 *
 * Caller must hold hpool->lock.
 *
 * Return: 0 on success, the hpool_merge_page() error when a merge fails, or
 * -ENOMEM when huge pages of a size are still in use or reserved.
 */
static int hugetlb_pool_merge_all_pages(struct dhugetlb_pool *hpool)
{
	struct huge_pages_pool *pool_2m = &hpool->hpages_pool[HUGE_PAGES_POOL_2M];
	struct huge_pages_pool *pool_1g = &hpool->hpages_pool[HUGE_PAGES_POOL_1G];
	int err;

	lockdep_assert_held(&hpool->lock);

	/* Stage 1: rebuild every split 2M page from its 4K pages. */
	while (pool_2m->split_normal_pages) {
		err = hpool_merge_page(hpool, HUGE_PAGES_POOL_2M, true);
		if (err) {
			pr_err("dynamic_hugetlb: %s: merge 4K failed!\n",
				hpool->attach_memcg->css.cgroup->kn->name);
			return err;
		}
	}
	if (pool_2m->used_huge_pages || pool_2m->resv_huge_pages) {
		pr_err("dynamic_hugetlb: %s: 2M pages in use or resv\n",
			hpool->attach_memcg->css.cgroup->kn->name);
		return -ENOMEM;
	}
	pool_2m->free_normal_pages += pool_2m->nr_huge_pages;
	pool_2m->nr_huge_pages = 0;
	pool_2m->free_huge_pages = 0;

	/* Stage 2: rebuild every split 1G page from its 2M pages. */
	while (pool_1g->split_normal_pages) {
		err = hpool_merge_page(hpool, HUGE_PAGES_POOL_1G, true);
		if (err) {
			pr_err("dynamic_hugetlb: %s: merge 2M failed!\n",
				hpool->attach_memcg->css.cgroup->kn->name);
			return err;
		}
	}
	if (pool_1g->used_huge_pages || pool_1g->resv_huge_pages) {
		pr_err("dynamic_hugetlb: %s: 1G pages in use or resv\n",
			hpool->attach_memcg->css.cgroup->kn->name);
		return -ENOMEM;
	}
	pool_1g->free_normal_pages += pool_1g->nr_huge_pages;
	pool_1g->nr_huge_pages = 0;
	pool_1g->free_huge_pages = 0;

	return 0;
}

/*
 * Take a reference on @hpool unless it is NULL or its refcount has already
 * dropped to zero. Returns true when a reference was taken.
 */
static bool get_hpool_unless_zero(struct dhugetlb_pool *hpool)
{
	return hpool && atomic_inc_not_zero(&hpool->refcnt);
}

/*
 * Drop a reference on @hpool (NULL is ignored). When the last reference goes
 * away the css reference of the attached memcg is released and the pool is
 * freed after an RCU grace period.
 */
static void put_hpool(struct dhugetlb_pool *hpool)
{
	if (!hpool || !atomic_dec_and_test(&hpool->refcnt))
		return;

	css_put(&hpool->attach_memcg->css);
	/* Let RCU readers that may still be looking at the pool finish. */
	synchronize_rcu();
	kfree(hpool);
}

/*
 * Table that maps a hugepage_index() value to the pool owning that region.
 *
 * @count: number of slots in @hpool.
 * @hpool: one pool pointer per index; declared as a C99 flexible array
 *         member instead of the deprecated zero-length array so that the
 *         compiler can diagnose misuse. sizeof() of the struct is unchanged.
 */
struct dhugetlb_pagelist {
	unsigned long count;
	struct dhugetlb_pool *hpool[];
};

/* The global index -> pool table; allocated in dynamic_hugetlb_init(). */
static struct dhugetlb_pagelist *dhugetlb_pagelist_t;
/*
 * Guards dhugetlb_pagelist_t: taken for reading to access slots and for
 * writing when the table is reallocated.
 */
static DEFINE_RWLOCK(dhugetlb_pagelist_rwlock);

/*
 * Look up the pool recorded in @memcg and take a reference on it.
 *
 * The pointer is read and pinned inside an RCU read-side section.
 * Returns the pinned pool, or NULL when there is none or it is going away.
 * The caller releases the reference with put_hpool().
 */
static struct dhugetlb_pool *get_hpool_from_memcg(struct mem_cgroup *memcg)
{
	struct dhugetlb_pool *candidate;
	bool pinned;

	rcu_read_lock();
	candidate = memcg->hpool;
	pinned = get_hpool_unless_zero(candidate);
	rcu_read_unlock();

	return pinned ? candidate : NULL;
}

/*
 * Look up the pool of the memory cgroup @tsk belongs to and take a
 * reference on it.
 *
 * The memcg is pinned with css_tryget() only for the duration of the lookup.
 * Returns the pinned pool or NULL; release it with put_hpool().
 */
static struct dhugetlb_pool *get_hpool_from_task(struct task_struct *tsk)
{
	struct dhugetlb_pool *hpool;
	struct mem_cgroup *memcg;
	bool pinned;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(tsk);
	pinned = memcg && css_tryget(&memcg->css);
	rcu_read_unlock();

	if (!pinned)
		return NULL;

	hpool = get_hpool_from_memcg(memcg);
	css_put(&memcg->css);
	return hpool;
}

static int set_hpool_in_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool)
{
	/*
	 * There is not conflit when write to dhugetlb_pagelist_t->hpool, so just
	 * need read_lock here.
	 */
	read_lock(&dhugetlb_pagelist_rwlock);

	/*
	 * If page's pfn is greater than dhugetlb_pagelist_t->count (which may
	 * occurs due to memory hotplug) then dhugetlb_pagelist_t need to be
	 * reallocated, so need write_lock here.
	 */
	if (idx >= dhugetlb_pagelist_t->count) {
		unsigned long size;
		struct dhugetlb_pagelist *tmp;

		read_unlock(&dhugetlb_pagelist_rwlock);
		write_lock(&dhugetlb_pagelist_rwlock);

		size = sizeof(struct dhugetlb_pagelist) +
			(idx + 1) * sizeof(struct dhugetlb_pool *);
		tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC);
		if (!tmp) {
			write_unlock(&dhugetlb_pagelist_rwlock);
			return -ENOMEM;
		}
		tmp->count = idx + 1;
		dhugetlb_pagelist_t = tmp;

		write_unlock(&dhugetlb_pagelist_rwlock);
		read_lock(&dhugetlb_pagelist_rwlock);
	}
	dhugetlb_pagelist_t->hpool[idx] = hpool;
	read_unlock(&dhugetlb_pagelist_rwlock);

	return 0;
}

/*
 * Look up the pool that owns the region containing @page and take a
 * reference on it.
 *
 * put_hpool() waits for an RCU grace period before freeing a pool, so the
 * pointer read and the refcount bump are done inside one RCU read-side
 * section (as get_hpool_from_memcg() does); otherwise the pool could be
 * freed between reading the slot and touching its refcount.
 *
 * Returns the pinned pool or NULL; release it with put_hpool().
 */
static struct dhugetlb_pool *get_hpool_from_pagelist(struct page *page)
{
	unsigned long idx = hugepage_index(page_to_pfn(page));
	struct dhugetlb_pool *hpool = NULL;

	rcu_read_lock();

	read_lock(&dhugetlb_pagelist_rwlock);
	if (idx < dhugetlb_pagelist_t->count)
		hpool = dhugetlb_pagelist_t->hpool[idx];
	read_unlock(&dhugetlb_pagelist_rwlock);

	if (!get_hpool_unless_zero(hpool))
		hpool = NULL;

	rcu_read_unlock();

	return hpool;
}

/*
 * Report whether @page lies in a region owned by a live dynamic hugetlb
 * pool. Always false when the feature is disabled.
 */
bool page_belong_to_dynamic_hugetlb(struct page *page)
{
	struct dhugetlb_pool *owner;

	if (!dhugetlb_enabled)
		return false;

	owner = get_hpool_from_pagelist(page);
	if (!owner)
		return false;

	/* Only existence matters, so the reference is dropped right away. */
	put_hpool(owner);
	return true;
}

/*
 * Return the pool of the memory cgroup of @tsk, or NULL when there is none
 * or the feature is disabled.
 *
 * The reference taken for the lookup is dropped before returning, so the
 * result is only suitable for a NULL test and must not be dereferenced.
 */
static struct dhugetlb_pool *find_hpool_by_task(struct task_struct *tsk)
{
	struct dhugetlb_pool *hpool = NULL;

	if (dhugetlb_enabled) {
		hpool = get_hpool_from_task(tsk);
		put_hpool(hpool);
	}

	return hpool;
}

int task_has_mem_in_hpool(struct task_struct *tsk)
{
	struct dhugetlb_pool *hpool;

	if (!dhugetlb_enabled)
		return 0;

	hpool = find_hpool_by_task(tsk);

	return hpool ? -EPERM : 0;
}

/*
 * Decide whether an allocation with @gfp_mask made by the current task may
 * be served from a dynamic hugetlb pool. Kernel threads never qualify.
 */
static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask)
{
	gfp_t masked;

	if (current->flags & PF_KTHREAD)
		return false;

	/*
	 * The cgroup only charges anonymous and file pages from userspace,
	 * so only GFP_HIGHUSER_MOVABLE requests qualify. Some filesystems
	 * (e.g. loop device, xfs) mask out __GFP_IO | __GFP_FS to avoid
	 * recursive memory requests; those two bits are therefore forced on
	 * before comparing.
	 */
	masked = (gfp_mask & GFP_HIGHUSER_MOVABLE) | __GFP_IO | __GFP_FS;

	return masked == GFP_HIGHUSER_MOVABLE;
}

/*
 * Allocate one 4K page for the current task from the percpu pool of its
 * memory cgroup's dynamic hugetlb pool.
 *
 * The percpu pool is refilled from the hpool (PERCPU_POOL_PAGE_BATCH pages at
 * a time) when it is empty. Returns the page with PagePool set, or NULL when
 * the task has no pool, normal pages are disabled for the pool, or no page
 * could be obtained.
 */
static struct page *__alloc_page_from_dhugetlb_pool(void)
{
	struct percpu_pages_pool *percpu_pool;
	struct dhugetlb_pool *hpool;
	struct page *page = NULL;
	unsigned long flags;

	hpool = get_hpool_from_task(current);
	if (!hpool)
		return NULL;

	if (hpool->normal_pages_disabled)
		goto out;
	/*
	 * NOTE(review): smp_processor_id() is evaluated before interrupts are
	 * disabled below - confirm callers cannot be migrated in between (the
	 * lock of whichever pool was picked is still held for all accesses).
	 */
	percpu_pool = &hpool->percpu_pool[smp_processor_id()];
	/*
	 * Before we lock percpu_pool, must be sure hpool is unlocked.
	 */
	spin_lock_irqsave(&percpu_pool->lock, flags);

	do {
		/*
		 * Before discard the bad page, set PagePool flag to
		 * distinguish from free page. And increase used_pages
		 * to guarantee used + freed = total.
		 */
		if (page)
			SetPagePool(page);
		page = NULL;
		if (percpu_pool->free_pages == 0) {
			int ret;

			/* Lock order: percpu pool lock first, then hpool->lock. */
			spin_lock(&hpool->lock);
			ret = add_pages_to_percpu_pool(hpool, percpu_pool,
							PERCPU_POOL_PAGE_BATCH);
			spin_unlock(&hpool->lock);
			if (ret)
				goto unlock;
		}

		page = list_entry(percpu_pool->head_page.next, struct page, lru);
		list_del(&page->lru);
		percpu_pool->free_pages--;
		percpu_pool->used_pages++;
	/* Retry with the next page while check_new_page() rejects this one. */
	} while (page && check_new_page(page));
	SetPagePool(page);

unlock:
	spin_unlock_irqrestore(&percpu_pool->lock, flags);
out:
	put_hpool(hpool);
	return page;
}

/*
 * Try to serve an order-0 allocation from the dynamic hugetlb pool of the
 * current task.
 *
 * Returns a page prepared with prep_new_page(), or NULL when the feature is
 * disabled, @order is not 0, @gfp does not qualify, or the pool has no page.
 */
struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order,
					   unsigned int flags)
{
	struct page *page;

	if (!dhugetlb_enabled || order != 0)
		return NULL;

	if (!should_allocate_from_dhugetlb_pool(gfp))
		return NULL;

	page = __alloc_page_from_dhugetlb_pool();
	if (!page)
		return NULL;

	prep_new_page(page, order, gfp, flags);
	return page;
}

/*
 * Give the 4K page @page back to the percpu pool (indexed by the current
 * CPU) of the dynamic hugetlb pool that owns it.
 *
 * When the percpu pool grows beyond PERCPU_POOL_PAGE_MAX free pages,
 * PERCPU_POOL_PAGE_BATCH of them are handed back to the hpool.
 */
static void __free_page_to_dhugetlb_pool(struct page *page)
{
	struct percpu_pages_pool *percpu_pool;
	struct dhugetlb_pool *hpool;
	unsigned long flags;

	hpool = get_hpool_from_pagelist(page);
	if (!hpool) {
		pr_err("dhugetlb: free error: get hpool failed\n");
		return;
	}

	percpu_pool = &hpool->percpu_pool[smp_processor_id()];
	spin_lock_irqsave(&percpu_pool->lock, flags);

	ClearPagePool(page);
	/*
	 * A page rejected by free_pages_prepare() is not put on any free
	 * list; PagePool is set again and the counters are left untouched.
	 */
	if (!free_pages_prepare(page, 0, true)) {
		SetPagePool(page);
		goto out;
	}
	list_add(&page->lru, &percpu_pool->head_page);
	percpu_pool->free_pages++;
	percpu_pool->used_pages--;
	if (percpu_pool->free_pages > PERCPU_POOL_PAGE_MAX) {
		/* Lock order: percpu pool lock first, then hpool->lock. */
		spin_lock(&hpool->lock);
		reclaim_pages_from_percpu_pool(hpool, percpu_pool, PERCPU_POOL_PAGE_BATCH);
		spin_unlock(&hpool->lock);
	}
out:
	spin_unlock_irqrestore(&percpu_pool->lock, flags);
	put_hpool(hpool);
}

/*
 * Free @page to its dynamic hugetlb pool when it carries the PagePool flag.
 * Returns true when the page was handled here, false when the caller has to
 * free it by other means.
 */
bool free_page_to_dhugetlb_pool(struct page *page)
{
	if (dhugetlb_enabled && PagePool(page)) {
		__free_page_to_dhugetlb_pool(page);
		return true;
	}

	return false;
}

/*
 * Remove every page carrying the PagePool flag from @list and free it to its
 * dynamic hugetlb pool. Other pages are left on @list.
 */
void free_page_list_to_dhugetlb_pool(struct list_head *list)
{
	struct page *cur, *tmp;

	if (!dhugetlb_enabled)
		return;

	list_for_each_entry_safe(cur, tmp, list, lru) {
		if (!PagePool(cur))
			continue;

		list_del(&cur->lru);
		__free_page_to_dhugetlb_pool(cur);
	}
}

/*
 * Bind the hugetlbfs inode info @p to the dynamic hugetlb pool of the
 * current task when @h describes PMD- or PUD-sized pages; otherwise clear
 * the binding. The reference taken here is dropped by unlink_hpool().
 */
void link_hpool(struct hugetlbfs_inode_info *p, struct hstate *h)
{
	unsigned long size;

	if (!dhugetlb_enabled || !p)
		return;

	size = huge_page_size(h);
	p->hpool = (size == PMD_SIZE || size == PUD_SIZE) ?
			get_hpool_from_task(current) : NULL;
}

/* Drop the pool reference held by @p (if any) and clear the binding. */
void unlink_hpool(struct hugetlbfs_inode_info *p)
{
	if (dhugetlb_enabled && p) {
		put_hpool(p->hpool);
		p->hpool = NULL;
	}
}

/* True when the feature is enabled and @p is bound to a dynamic hugetlb pool. */
bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p)
{
	return dhugetlb_enabled && p && p->hpool;
}

/*
 * Adjust the mmap reservation of the pool bound to @p by @delta huge pages
 * of the size described by @h.
 *
 * A positive @delta reserves pages and only succeeds when that many free
 * huge pages are not reserved yet; a negative @delta releases reservations.
 *
 * Return: 0 on success, or when there is nothing to do (feature disabled,
 * no pool bound to @p, @delta == 0); -ENOMEM when a positive @delta cannot
 * be satisfied.
 */
int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p)
{
	struct dhugetlb_pool *hpool = p ? p->hpool : NULL;
	struct huge_pages_pool *hpages_pool;
	int ret = -ENOMEM;

	if (!dhugetlb_enabled || !hpool)
		return 0;

	if (delta == 0)
		return 0;

	spin_lock_irq(&hpool->lock);
	if (hstate_is_gigantic(h))
		hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G];
	else
		hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_2M];
	if (delta > 0 && delta <= hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages) {
		hpages_pool->resv_huge_pages += delta;
		ret = 0;
		trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages,
						  DHUGETLB_RESV, huge_page_size(h));
	} else if (delta < 0) {
		unsigned long nr_unresv = (unsigned long)(-delta);

		/*
		 * Check for underflow before subtracting. Testing the counter
		 * for "< 0" afterwards can never fire when the counter is
		 * unsigned (it is used as an unsigned long elsewhere in this
		 * file - confirm against the struct definition).
		 */
		WARN_ON(nr_unresv > hpages_pool->resv_huge_pages);
		hpages_pool->resv_huge_pages -= nr_unresv;
		ret = 0;
		trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages,
						  DHUGETLB_UNRESV, huge_page_size(h));
	}
	spin_unlock_irq(&hpool->lock);

	return ret;
}

/*
 * Take one reserved huge page of the size described by @h from @hpool.
 *
 * With @need_unreserved set, one mmap reservation is consumed as well and
 * the page is flagged so that the reservation can be restored on free.
 *
 * Returns the page (refcounted, hugetlb destructor set, PagePool set), or
 * NULL when the feature is disabled or no free huge page is available.
 */
struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetlb_pool *hpool,
						bool need_unreserved)
{
	struct huge_pages_pool *pool;
	struct page *page = NULL;
	unsigned long irq_flags;

	if (!dhugetlb_enabled)
		return NULL;

	spin_lock_irqsave(&hpool->lock, irq_flags);
	pool = &hpool->hpages_pool[hstate_is_gigantic(h) ? HUGE_PAGES_POOL_1G :
							   HUGE_PAGES_POOL_2M];
	if (!pool->free_huge_pages)
		goto unlock;

	/* Huge pages are handed out from the head of the free list. */
	page = list_entry(pool->hugepage_freelists.next, struct page, lru);
	list_del(&page->lru);
	pool->free_huge_pages--;
	pool->used_huge_pages++;
	if (need_unreserved) {
		SetHPageRestoreReserve(page);
		pool->resv_huge_pages--;
		trace_dynamic_hugetlb_acct_memory(hpool, pool->resv_huge_pages,
						  DHUGETLB_UNRESV, huge_page_size(h));
	}
	trace_dynamic_hugetlb_alloc_free(hpool, page, pool->free_huge_pages,
					 DHUGETLB_ALLOC, huge_page_size(h));

	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
	set_page_refcounted(page);
	SetPagePool(page);

unlock:
	spin_unlock_irqrestore(&hpool->lock, irq_flags);

	return page;
}

/*
 * Return the huge page @page to the dynamic hugetlb pool that owns it and,
 * with @restore_reserve set, give back the mmap reservation it consumed.
 */
void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve)
{
	struct hstate *h = page_hstate(page);
	struct huge_pages_pool *pool;
	struct dhugetlb_pool *hpool;
	unsigned long irq_flags;

	hpool = get_hpool_from_pagelist(page);
	if (!hpool) {
		pr_err("dhugetlb: free error: get hpool failed\n");
		return;
	}

	spin_lock_irqsave(&hpool->lock, irq_flags);
	/*
	 * memory_failure will free the hwpoison hugepage, and then try to
	 * dissolve it and free the subpages to the buddy system. Since a
	 * page in a dhugetlb_pool should not be freed to the buddy system,
	 * a poisoned hugepage is simply left isolated here (not put back on
	 * any list), which also skips the later dissolution.
	 */
	if (!PageHWPoison(page)) {
		ClearPagePool(page);
		pool = &hpool->hpages_pool[hstate_is_gigantic(h) ? HUGE_PAGES_POOL_1G :
								   HUGE_PAGES_POOL_2M];

		list_add(&page->lru, &pool->hugepage_freelists);
		pool->free_huge_pages++;
		pool->used_huge_pages--;
		if (restore_reserve) {
			pool->resv_huge_pages++;
			trace_dynamic_hugetlb_acct_memory(hpool, pool->resv_huge_pages,
							  DHUGETLB_RESV, huge_page_size(h));
		}
		trace_dynamic_hugetlb_alloc_free(hpool, page, pool->free_huge_pages,
						 DHUGETLB_FREE, huge_page_size(h));
	}
	spin_unlock_irqrestore(&hpool->lock, irq_flags);
	put_hpool(hpool);
}

/*
 * Move up to @nr_pages free PUD-sized huge pages of node @nid from the
 * hugetlb free list into the 1G pool of @hpool, recording @hpool as the
 * owner of each moved page in the global page list.
 *
 * Return: -ENOMEM when there is no PUD-sized hstate or the node does not
 * have @nr_pages free unreserved huge pages; otherwise the result of the
 * last set_hpool_in_dhugetlb_pagelist() call.
 *
 * NOTE(review): a page whose page-list update fails is skipped, so the
 * function can return an error after some pages were already moved, and can
 * return 0 after moving fewer than @nr_pages. Confirm that callers cope
 * with a partial transfer.
 */
static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool,
				       unsigned long nid, unsigned long nr_pages)
{
	struct hstate *h = size_to_hstate(PUD_SIZE);
	struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G];
	struct page *page, *next;
	unsigned long count = 0, idx;
	int ret = 0;

	if (!h)
		return -ENOMEM;

	/* Lock order: hpool->lock first, then hugetlb_lock. */
	spin_lock_irq(&hpool->lock);
	spin_lock(&hugetlb_lock);
	if (h->free_huge_pages_node[nid] - h->resv_huge_pages_node[nid] < nr_pages) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) {
		idx = hugepage_index(page_to_pfn(page));
		ret = set_hpool_in_dhugetlb_pagelist(idx, hpool);
		if (ret)
			continue;

		/* Transfer the page and fix up the counters on both sides. */
		ClearHPageFreed(page);
		list_move_tail(&page->lru, &hpages_pool->hugepage_freelists);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		hpool->total_huge_pages++;
		hpages_pool->free_normal_pages++;

		if (++count == nr_pages)
			break;
	}

out_unlock:
	spin_unlock(&hugetlb_lock);
	spin_unlock_irq(&hpool->lock);
	return ret;
}

/*
 * Hand every page on the 1G free list of @hpool back to the hugetlb free
 * list of its node and clear its owner in the global page list.
 *
 * Caller must hold hpool->lock; hugetlb_lock is taken inside.
 *
 * Return: 0 on success (also when there is no PUD-sized hstate), or the
 * error of the first failing page-list update, in which case the remaining
 * pages are left in the pool.
 */
static int free_hugepage_to_hugetlb(struct dhugetlb_pool *hpool)
{
	struct hstate *h = size_to_hstate(PUD_SIZE);
	struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G];
	struct page *page, *next, *p;
	unsigned long pfn, idx;
	unsigned int nr_pages;
	int nid, ret = 0;

	if (!h)
		return ret;

	lockdep_assert_held(&hpool->lock);

	spin_lock(&hugetlb_lock);
	list_for_each_entry_safe(page, next, &hpages_pool->hugepage_freelists, lru) {
		/* Clear ->mapping of every base page of the huge page. */
		nr_pages = 1 << huge_page_order(h);
		pfn = page_to_pfn(page);
		for (; nr_pages--; pfn++) {
			p = pfn_to_page(pfn);
			p->mapping = NULL;
		}
		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
		/* compound_nr and mapping are union in page, reset it. */
		set_compound_order(page, PUD_SHIFT - PAGE_SHIFT);
		nid = page_to_nid(page);
		SetHPageFreed(page);
		/* Transfer the page and fix up the counters on both sides. */
		list_move(&page->lru, &h->hugepage_freelists[nid]);
		hpool->total_huge_pages--;
		hpages_pool->free_normal_pages--;
		h->free_huge_pages++;
		h->free_huge_pages_node[nid]++;

		idx = hugepage_index(page_to_pfn(page));
		ret = set_hpool_in_dhugetlb_pagelist(idx, NULL);
		if (ret)
			break;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

/* Make @memcg share the pool pointer of @parent. No reference is taken here. */
void hugetlb_pool_inherit(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
	if (dhugetlb_enabled && memcg && parent)
		memcg->hpool = parent->hpool;
}

static int hugetlb_pool_create(struct mem_cgroup *memcg, unsigned long nid)
{
	struct dhugetlb_pool *hpool;
	int i;

	if (memcg_has_children(memcg))
		return -EINVAL;

	hpool = kzalloc(sizeof(struct dhugetlb_pool) +
			NR_PERCPU_POOL * sizeof(struct percpu_pages_pool), GFP_KERNEL);
	if (!hpool)
		return -ENOMEM;

	spin_lock_init(&hpool->lock);
	mutex_init(&hpool->reserved_lock);
	hpool->nid = nid;
	atomic_set(&hpool->refcnt, 1);

	for (i = 0; i < HUGE_PAGES_POOL_MAX; i++) {
		INIT_LIST_HEAD(&hpool->hpages_pool[i].hugepage_freelists);
		INIT_LIST_HEAD(&hpool->hpages_pool[i].hugepage_splitlists);
	}
	for (i = 0; i < NR_PERCPU_POOL; i++) {
		spin_lock_init(&hpool->percpu_pool[i].lock);
		INIT_LIST_HEAD(&hpool->percpu_pool[i].head_page);
	}

	hpool->attach_memcg = memcg;
	css_get(&memcg->css);
	memcg->hpool = hpool;

	return 0;
}

/*
 * Tear down the dynamic hugetlb pool attached to the memory cgroup of @cgrp:
 * merge all split pages back, return the 1G pages to hugetlb, detach the
 * pool from the memcg and drop the creation reference.
 *
 * Nothing is done when the feature is disabled or when the memcg does not
 * own a pool itself (no pool, or a pool attached to another memcg).
 *
 * Return: 0 on success or when there is nothing to do, -ENOMEM when merging
 * fails, otherwise the result of returning the pages to hugetlb.
 */
int hugetlb_pool_destroy(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = cgrp->subsys[memory_cgrp_id];
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct dhugetlb_pool *hpool = memcg ? memcg->hpool : NULL;
	int ret;

	if (!dhugetlb_enabled || !hpool || hpool->attach_memcg != memcg)
		return 0;

	/*
	 * Even if no process exists in the memory cgroup, some pages may still
	 * be occupied. Release these pages before merging them.
	 */
	mem_cgroup_force_empty(hpool->attach_memcg);

	spin_lock_irq(&hpool->lock);
	if (hugetlb_pool_merge_all_pages(hpool)) {
		spin_unlock_irq(&hpool->lock);
		return -ENOMEM;
	}

	ret = free_hugepage_to_hugetlb(hpool);
	memcg->hpool = NULL;
	spin_unlock_irq(&hpool->lock);

	put_hpool(hpool);
	return ret;
}

/*
 * Add @size huge pages from node @nid to the pool of @memcg, creating the
 * pool first when @memcg has none.
 *
 * Return: 0 on success; the hugetlb_pool_create() error when creation
 * fails; -EINVAL when an existing pool is attached to another memcg or
 * bound to another node; otherwise the alloc_hugepage_from_hugetlb() error.
 */
static int hugetlb_pool_update(struct mem_cgroup *memcg,
			       unsigned long nid, unsigned long size)
{
	struct dhugetlb_pool *hpool;
	bool new_create = false;
	int ret = -EINVAL;

	/* Look the pool up, creating it and retrying when it does not exist. */
	for (;;) {
		hpool = get_hpool_from_memcg(memcg);
		if (hpool)
			break;

		ret = hugetlb_pool_create(memcg, nid);
		if (ret)
			return ret;
		new_create = true;
	}

	if (hpool->attach_memcg == memcg && hpool->nid == nid) {
		ret = alloc_hugepage_from_hugetlb(hpool, nid, size);
		/*
		 * If a new hpool was created here but allocating the
		 * hugepages failed, destroy it right away by detaching it
		 * and dropping the creation reference.
		 */
		if (ret && new_create) {
			memcg->hpool = NULL;
			put_hpool(hpool);
		}
	}

	/* Drop the reference taken by the lookup above. */
	put_hpool(hpool);
	return ret;
}

/*
 * Tell whether the cgroup file @cft should be hidden: true for files whose
 * name contains "dhugetlb" while the feature is disabled.
 */
bool dhugetlb_hide_files(struct cftype *cft)
{
	return !dhugetlb_enabled && strstr(cft->name, "dhugetlb") != NULL;
}

/*
 * Set the number of reserved huge pages of pool @hpages_pool_idx in the pool
 * of @memcg to the value parsed from @buf.
 *
 * Growing the reservation splits pages of the next larger pool and, for the
 * 2M pool only, merges 4K pages back (first without, then with migration)
 * until enough free normal pages exist. Shrinking releases at most the free
 * huge pages that are not covered by an mmap reservation.
 *
 * Return: -EINVAL when the feature is disabled, @buf has trailing garbage,
 * or @memcg has no pool; 0 otherwise - also when the requested value could
 * only be reached partially.
 */
static ssize_t update_reserved_pages(struct mem_cgroup *memcg, char *buf, int hpages_pool_idx)
{
	struct dhugetlb_pool *hpool;
	struct huge_pages_pool *hpages_pool;
	unsigned long nr_pages;
	unsigned long delta;
	char *endp;

	if (!dhugetlb_enabled)
		return -EINVAL;

	buf = strstrip(buf);
	nr_pages = memparse(buf, &endp);
	if (*endp != '\0')
		return -EINVAL;

	hpool = get_hpool_from_memcg(memcg);
	if (!hpool)
		return -EINVAL;

	/*
	 * reserved_lock serializes whole updates; hpool->lock alone cannot,
	 * because hpool_merge_page() drops and re-takes it.
	 */
	mutex_lock(&hpool->reserved_lock);
	spin_lock_irq(&hpool->lock);
	hpages_pool = &hpool->hpages_pool[hpages_pool_idx];
	if (nr_pages > hpages_pool->nr_huge_pages) {
		/* Grow: delta is the number of additional pages wanted. */
		delta = nr_pages - hpages_pool->nr_huge_pages;
		/*
		 * For the 1G pool the index passed below is out of range, so
		 * hpool_split_page() fails and the loop ends immediately.
		 */
		while (delta > hpages_pool->free_normal_pages) {
			if (hpool_split_page(hpool, hpages_pool_idx - 1))
				break;
		}
		/* Currently, only merging 2M hugepages is supported */
		if (hpages_pool_idx == HUGE_PAGES_POOL_2M) {
			/*
			 * First try to merge pages without migration, If this can not meet
			 * the requirements, then try to merge pages with migration.
			 */
			while (delta > hpages_pool->free_normal_pages) {
				if (hpool_merge_page(hpool, hpages_pool_idx, false))
					break;
			}
			while (delta > hpages_pool->free_normal_pages) {
				if (hpool_merge_page(hpool, hpages_pool_idx, true))
					break;
			}
		}
		/* Reserve as many pages as are actually available. */
		delta = min(nr_pages - hpages_pool->nr_huge_pages, hpages_pool->free_normal_pages);
		hpages_pool->nr_huge_pages += delta;
		hpages_pool->free_huge_pages += delta;
		hpages_pool->free_normal_pages -= delta;
	} else {
		/* Shrink: only free huge pages without an mmap reservation can go. */
		delta = min(hpages_pool->nr_huge_pages - nr_pages,
			    hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages);
		hpages_pool->nr_huge_pages -= delta;
		hpages_pool->free_huge_pages -= delta;
		hpages_pool->free_normal_pages += delta;
	}
	spin_unlock_irq(&hpool->lock);
	mutex_unlock(&hpool->reserved_lock);
	put_hpool(hpool);
	return 0;
}

/*
 * cgroup write handler: set the number of reserved 2M pages from @buf.
 * Returns @nbytes on success or a negative error code.
 */
ssize_t write_2M_reserved_pages(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	ssize_t err = update_reserved_pages(memcg, buf, HUGE_PAGES_POOL_2M);

	if (err)
		return err;
	return nbytes;
}

/*
 * cgroup write handler: set the number of reserved 1G pages from @buf.
 * Returns @nbytes on success or a negative error code.
 */
ssize_t write_1G_reserved_pages(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	ssize_t err = update_reserved_pages(memcg, buf, HUGE_PAGES_POOL_1G);

	if (err)
		return err;
	return nbytes;
}

/*
 * cgroup write handler for the "normal pages disabled" switch of the pool
 * of this memcg. Only 0 and 1 are accepted.
 *
 * Return: 0 on success, -EINVAL when the feature is disabled, @val is out
 * of range, or the memcg has no pool.
 */
int normal_pages_disabled_write(struct cgroup_subsys_state *css,
			       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct dhugetlb_pool *hpool;

	if (!dhugetlb_enabled || val > 1)
		return -EINVAL;

	hpool = get_hpool_from_memcg(memcg);
	if (!hpool)
		return -EINVAL;

	hpool->normal_pages_disabled = val;
	put_hpool(hpool);

	return 0;
}

/*
 * cgroup read handler for the "normal pages disabled" switch. Returns the
 * current value, or 0 when the feature is disabled or the memcg has no pool.
 */
u64 normal_pages_disabled_read(struct cgroup_subsys_state *css,
			      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct dhugetlb_pool *hpool;
	u64 val = 0;

	if (!dhugetlb_enabled)
		return 0;

	hpool = get_hpool_from_memcg(memcg);
	if (hpool) {
		val = hpool->normal_pages_disabled;
		put_hpool(hpool);
	}

	return val;
}

/*
 * cgroup write handler that adds huge pages to the pool of this memcg.
 * @buf has the form "<nid> <count>", separated by a single space.
 *
 * Return: @nbytes on success; -EINVAL when the feature is disabled, there
 * is no memcg, or the input is malformed; otherwise the
 * hugetlb_pool_update() error.
 */
ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nid, size;
	char *endp;
	int err;

	if (!dhugetlb_enabled || !memcg)
		return -EINVAL;

	/* First field: node id. nid is unsigned, so only the upper bound needs checking. */
	buf = strstrip(buf);
	nid = memparse(buf, &endp);
	if (*endp != ' ' || nid >= MAX_NUMNODES)
		return -EINVAL;

	/* Second field: number of huge pages, which must not be zero. */
	buf = endp + 1;
	size = memparse(buf, &endp);
	if (*endp != '\0' || size == 0)
		return -EINVAL;

	err = hugetlb_pool_update(memcg, nid, size);
	if (err)
		return err;

	return nbytes;
}

/*
 * seq_file show handler that prints the page counters of the dynamic
 * hugetlb pool of this memcg.
 *
 * All pool locks are held while the counters are read so that the 4K totals
 * summed over the percpu pools are consistent. Always returns 0.
 */
int hugetlb_pool_info_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	struct dhugetlb_pool *hpool;
	unsigned long free_pages = 0, used_pages = 0;
	int i;

	if (!dhugetlb_enabled)
		return 0;

	hpool = get_hpool_from_memcg(memcg);
	if (!hpool) {
		seq_printf(m, "Current hierarchial have not memory pool.\n");
		return 0;
	}

	dhugetlb_lock_all(hpool);

	/* 4K totals: the shared 4K pool plus every percpu pool. */
	free_pages = hpool->hpages_pool[HUGE_PAGES_POOL_4K].free_normal_pages;
	for (i = 0; i < NR_PERCPU_POOL; i++) {
		free_pages += hpool->percpu_pool[i].free_pages;
		used_pages += hpool->percpu_pool[i].used_pages;
	}

	/*
	 * free_pages and used_pages are unsigned long locals and are printed
	 * with %lu. NOTE(review): the pool counters printed with %ld look
	 * unsigned as well - confirm against the struct definitions.
	 */
	seq_printf(m,
		   "dhugetlb_total_pages %ld\n"
		   "1G_total_reserved_pages %ld\n"
		   "1G_free_reserved_pages %ld\n"
		   "1G_mmap_reserved_pages %ld\n"
		   "1G_used_pages %ld\n"
		   "2M_total_reserved_pages %ld\n"
		   "2M_free_reserved_pages %ld\n"
		   "2M_mmap_reserved_pages %ld\n"
		   "2M_used_pages %ld\n"
		   "1G_free_unreserved_pages %ld\n"
		   "2M_free_unreserved_pages %ld\n"
		   "4K_free_pages %lu\n"
		   "4K_used_pages %lu\n",
		   hpool->total_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_1G].nr_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_1G].free_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_1G].resv_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_1G].used_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_2M].nr_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_2M].free_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_2M].resv_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_2M].used_huge_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_1G].free_normal_pages,
		   hpool->hpages_pool[HUGE_PAGES_POOL_2M].free_normal_pages,
		   free_pages,
		   used_pages);

	dhugetlb_unlock_all(hpool);
	put_hpool(hpool);
	return 0;
}

/* Minimum number of slots allocated for the global page list. */
#define	DEFAULT_PAGELIST_COUNT	4096
/*
 * Boot-time initialisation: when the feature was requested on the command
 * line, allocate the global page list (one slot per hugepage index up to
 * max_pfn, but at least DEFAULT_PAGELIST_COUNT) and switch the static key on.
 */
void __init dynamic_hugetlb_init(void)
{
	unsigned long nr_slots, bytes;

	if (!enable_dhugetlb)
		return;

	nr_slots = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT);
	bytes = sizeof(struct dhugetlb_pagelist) + nr_slots * sizeof(struct dhugetlb_pool *);

	dhugetlb_pagelist_t = kzalloc(bytes, GFP_KERNEL);
	if (dhugetlb_pagelist_t) {
		dhugetlb_pagelist_t->count = nr_slots;
		static_branch_enable(&dhugetlb_enabled_key);
		pr_info("Dynamic hugetlb is enabled\n");
		return;
	}

	/* Without the page list the feature stays disabled. */
	pr_info("Dynamic hugetlb init failed, need %lu memory\n", bytes);
}

/*
 * Handler for the "dynamic_hugetlb=" early parameter: the value "on"
 * requests the feature, anything else is ignored. Always returns 0.
 */
static int __init dynamic_hugetlb_setup(char *s)
{
	if (s && !strcmp(s, "on"))
		enable_dhugetlb = true;

	return 0;
}
early_param("dynamic_hugetlb", dynamic_hugetlb_setup);