# e84b8a85 created on 2018-01-13 (commit history)
#!/usr/bin/env python
# -*-coding=utf-8-*-

import numpy as np

class Word2vec(object):
    """Load pretrained word2vec vectors and build an embedding matrix.

    ``self.wv`` maps each word to its vector (a list of floats).  It is kept
    on the instance and is not cleared between calls, so vectors loaded by
    successive ``load_w2v_array`` calls accumulate.
    """

    def __init__(self):
        self.wv = {}

    def load_w2v_array(self, path, id_to_word, is_binary=False):
        """Read a word2vec file and return the embedding matrix for a vocab.

        :param path: path of the word2vec file; its first line must be the
            ``"<word count> <dim>"`` header.
        :param id_to_word: mapping from vocabulary index (``0 .. len - 1``)
            to word; it is queried with ``.get(index)``.
        :param is_binary: ``True`` to read the word2vec binary layout
            (little-endian float32 vectors), ``False`` for the text layout.
        :return: ``numpy`` array with one row per vocabulary index.  Words
            found in the file get their pretrained vector; all other words
            get a vector drawn uniformly from ``[-bound, bound]`` with
            ``bound = sqrt(6) / sqrt(len(id_to_word))``.
        :raises ValueError: if the header line is malformed or a text vector
            component cannot be parsed as a float.
        """
        if is_binary:
            dim = self._load_binary(path)
        else:
            dim = self._load_text(path)

        vocab_size = len(id_to_word)
        # Guard the empty vocabulary: sqrt(0) in the denominator would only
        # produce a RuntimeWarning and an unused infinite bound.
        bound = np.sqrt(6.0) / np.sqrt(vocab_size) if vocab_size else 0.0

        embedding = []
        word2vec_oov_count = 0
        for index in range(vocab_size):
            word = id_to_word.get(index)
            if word in self.wv:
                embedding.append(self.wv[word])
            else:
                # TODO: why random initialisation? (open question left by the
                # original author).  Out-of-vocabulary words have no
                # pretrained vector, so each one gets its own random draw.
                word2vec_oov_count += 1
                embedding.append(np.random.uniform(-bound, bound, dim))

        print("word2vec oov count: %d"%(word2vec_oov_count,))
        return np.array(embedding)

    @staticmethod
    def _parse_header(header_line):
        """Parse the ``"<word count> <dim>"`` header, report it, return dim."""
        fields = header_line.split()
        if len(fields) != 2:
            raise ValueError(
                "expected a '<word count> <dim>' header line, got %r"
                % (header_line,)
            )
        m, n = fields
        dim = int(n)
        print("%s words dim : %s"% (m, n ))
        return dim

    def _load_text(self, path):
        """Fill ``self.wv`` from a text-format file and return the dimension."""
        # The context manager closes the file even if parsing raises.
        with open(path, encoding="utf-8", errors="ignore") as f:
            dim = self._parse_header(f.readline())
            for line in f:
                # Split on ASCII spaces only, so words that contain other
                # whitespace characters stay intact, and drop the empty
                # tokens that repeated spaces would otherwise produce.
                fields = [tok for tok in line.strip().split(" ") if tok]
                if not fields:
                    continue
                vec = [float(v) for v in fields[1:]]
                if len(vec) != dim:
                    # Rows of the wrong width are skipped, not fatal.
                    continue
                self.wv[fields[0]] = vec
        return dim

    def _load_binary(self, path):
        """Fill ``self.wv`` from a binary-format file and return the dimension."""
        with open(path, "rb") as f:
            header = f.readline().decode("utf-8", errors="ignore")
            dim = self._parse_header(header)
            nbytes = 4 * dim  # one float32 per component
            while True:
                # A record is "<word> " followed by dim raw float32 values;
                # an optional "\n" after each vector is skipped here.
                word_bytes = bytearray()
                while True:
                    ch = f.read(1)
                    if not ch or ch == b" ":
                        break
                    if ch != b"\n":
                        word_bytes += ch
                if not ch:
                    break  # end of file
                raw = f.read(nbytes)
                if len(raw) != nbytes:
                    break  # truncated final record
                word = word_bytes.decode("utf-8", errors="ignore")
                vec = np.frombuffer(raw, dtype="<f4").astype(np.float64)
                # Store plain float lists, like the text loader does.
                self.wv[word] = vec.tolist()
        return dim