# e84b8a85 - created on 2018-01-13 (commit history)
#!/usr/bin/env python
# -*- coding:utf-8 -*-



def load_file(file_name):
    """Yield the sentences stored in a blank-line-delimited, tab-separated file.

    Every non-empty line is split on tab characters into a list of fields,
    and consecutive non-empty lines are collected into one sentence.  An
    empty line terminates the current sentence.

    Args:
        file_name: path of the file to read.

    Yields:
        A list of rows (each row is the list of tab-separated fields of one
        line).  An empty line that does not follow any row yields an empty
        list.  A final sentence that is not followed by an empty line is
        yielded as well.
    """
    sent = []
    # The context manager releases the handle when the generator finishes,
    # is closed early, or an error propagates out of the loop.
    with open(file_name) as f:
        for line in f:
            # Only the newline is removed so that tabs and spaces inside the
            # fields are preserved.
            line = line.strip("\n")
            if not line:
                yield sent
                sent = []
            else:
                sent.append(line.split("\t"))
    # Files that do not end with a blank line still carry a last sentence.
    if sent:
        yield sent


def papre_char_vec(train_file, dev_file, test_file, out_filename):
    """Write one space-joined line per sentence using the first field of each row.

    The three input files are read in the order train, dev, test with
    ``load_file`` and all sentences are written to a single output file.
    The first tab-separated field of every row is taken as the token
    (presumably a single character -- confirm against the data format).

    Args:
        train_file: path of the training data file.
        dev_file: path of the development data file.
        test_file: path of the test data file.
        out_filename: path of the text file to create or overwrite.
    """
    sent_counter = 0
    # ``with`` guarantees the output is flushed and closed, even on error.
    with open(out_filename, 'w') as outf:
        for file_name in [train_file, dev_file, test_file]:
            print("papre char vec train data from : %s" % (file_name))
            for sent in load_file(file_name):
                sent_counter += 1
                outf.write(" ".join(item[0] for item in sent) + "\n")
    print("all sent count %d" % (sent_counter))


def papre_word_vec(train_file, dev_file, test_file, out_filename):
    """Write one space-joined line of words per sentence, rebuilt from B/M/E/S labels.

    The three input files are read in the order train, dev, test with
    ``load_file``.  For every row the first field is the token and the
    second field is its segmentation label:

    * ``B`` / ``M`` -- the token is appended to the word being built;
    * ``E``         -- the token is appended and the word is emitted;
    * ``S``         -- the token is emitted as a word of its own.

    A word that is still open when the sentence ends (no closing ``E``) is
    not written.

    Args:
        train_file: path of the training data file.
        dev_file: path of the development data file.
        test_file: path of the test data file.
        out_filename: path of the text file to create or overwrite.

    Raises:
        ValueError: if a row carries a label other than B, M, E or S.
    """
    sent_counter = 0
    # ``with`` guarantees the output is flushed and closed, even on error.
    with open(out_filename, 'w') as outf:
        for file_name in [train_file, dev_file, test_file]:
            print("papre word vec train data from : %s" % (file_name))
            for sent in load_file(file_name):
                sent_counter += 1
                word = ""
                words = []
                for item in sent:
                    ch = item[0]
                    seg_label = item[1]
                    if seg_label in ("B", "M"):
                        word += ch
                    elif seg_label == "S":
                        words.append(ch)
                    elif seg_label == "E":
                        word += ch
                        words.append(word)
                        word = ""
                    else:
                        raise ValueError("%s ignore" % (seg_label))
                outf.write(" ".join(words) + "\n")
    # Report the total like papre_char_vec does; the counter was otherwise unused.
    print("all sent count %d" % (sent_counter))


if __name__ == '__main__':
    # Command-line entry point: all four paths are mandatory, and the
    # character-level output is produced from the train/dev/test files.
    import argparse

    cli = argparse.ArgumentParser()
    option_specs = (
        ("--train_file", "train_file path"),
        ("--dev_file", "dev file path"),
        ("--test_file", "test file path"),
        ("--out_file", "out dir for vec path"),
    )
    for flag, description in option_specs:
        cli.add_argument(flag, required=True, help=description)
    opts = cli.parse_args()

    papre_char_vec(
        opts.train_file,
        opts.dev_file,
        opts.test_file,
        opts.out_file,
    )