使用Glove对词进行向量化表示

心已赠人 2023-08-17 17:44 110阅读 0赞

#%% md
    
    # GloVe: 全局向量 
    
    #%%
    
    # 这些都是我们稍后将要使用的模块.在继续操作之前,请确保可以导入它们
    %matplotlib inline
    from __future__ import print_function
    import collections
    import math
    import numpy as np
    import os
    import random
    import tensorflow as tf
    import bz2
    from matplotlib import pylab
    from six.moves import range
    from six.moves.urllib.request import urlretrieve
    from sklearn.manifold import TSNE
    from sklearn.cluster import KMeans
    from scipy.sparse import lil_matrix
    import nltk # 标准预处理
    import operator # 按值对字典中的对应项进行排序
    #nltk.download() #tokenizers/punkt/PY3/english.pickle
    from math import ceil
    
    #%% md
    
    ## 数据集（Dataset）
    此代码下载[数据集](http://www.evanjones.ca/software/wikipedia2text.html)，
    其中包含多篇维基百科文章，总计大约61兆字节。此外，代码还会在下载后检查文件大小是否与预期一致。
    
    #%%
    
    # Base URL that the requested file name is appended to.
    url = 'http://www.evanjones.ca/software/'

    def maybe_download(filename, expected_bytes):
        """Return ``filename`` after making sure it exists locally with the expected size.

        If no file exists at ``filename`` it is fetched from ``url + filename``.
        The size on disk is then compared against ``expected_bytes``.

        Args:
            filename: Local path; also the name appended to ``url`` for the download.
            expected_bytes: Exact size in bytes the file must have.

        Returns:
            The path of the verified file.

        Raises:
            Exception: If the size on disk differs from ``expected_bytes``
                (the actual size is printed first).
        """
        if not os.path.exists(filename):
            filename, _ = urlretrieve(url + filename, filename)

        actual_bytes = os.stat(filename).st_size
        if actual_bytes != expected_bytes:
            # Show the size we actually got to help diagnose a partial download.
            print(actual_bytes)
            raise Exception('无法验证 ' + filename + '. 你能用浏览器来获取吗?')

        print('找到并验证 %s' % filename)
        return filename
    
    # Fetch the bz2-compressed Wikipedia extract (skipped when already on disk) and check its byte size.
    filename = maybe_download('wikipedia2text-extracted.txt.bz2', 18377035)
    
    #%% md
    
    ## 使用NLTK进行预处理读取数据
    
    将原始数据读取为字符串，转换为小写，并使用nltk库对其进行分词（tokenize）。此代码每次读取1MB的数据块，
    因为一次性处理全文会减慢任务速度；最后返回单词列表。
    
    #%%
    
    def read_data(filename):
        """Read a bz2-compressed text file and return it as a list of lower-cased tokens.

        The decompressed stream is consumed in 1 MB chunks; each chunk is
        decoded as UTF-8, lower-cased and tokenised with ``nltk.word_tokenize``.

        Args:
            filename: Path of the ``.bz2`` file to read.

        Returns:
            A flat list of token strings, in file order.
        """
        # Local import so this block does not depend on a file-level import.
        import codecs

        chunk_size = 1024 * 1024  # read 1 MB of decompressed data at a time
        # NOTE(review): this is the size of the *compressed* file, so at most that
        # many decompressed bytes are consumed below. Kept as in the original.
        file_size = os.stat(filename).st_size

        # An incremental decoder buffers an incomplete multi-byte sequence at the
        # end of a chunk and completes it with the next chunk. Decoding each chunk
        # independently raises UnicodeDecodeError when a character straddles a
        # chunk boundary.
        decoder = codecs.getincrementaldecoder('utf-8')()

        data = []
        with bz2.BZ2File(filename) as f:
            print('读取数据中...')
            for i in range(file_size // chunk_size + 1):
                bytes_to_read = min(chunk_size, file_size - (i * chunk_size))
                file_string = decoder.decode(f.read(bytes_to_read)).lower()
                # Split the chunk into word tokens and append them to the result.
                data.extend(nltk.word_tokenize(file_string))
        return data
    
    # Tokenise the downloaded corpus into a flat list of lower-cased tokens.
    words = read_data(filename)
    print('数据大小 %d' % len(words))
    token_count = len(words)
    
    # Print a few tokens from both ends of the list as a quick sanity check.
    print('示例单词（开始）: ',words[:10])
    print('示例单词（结束）: ',words[-10:])
    
    #%% md
    
    ## 创建字典（Dictionaries）
    
    构建以下内容.为了理解这些元素，我们假设文本内容为“I like to go to school”
    
    * `dictionary`: 将字符串单词映射到ID (例如 {I:0, like:1, to:2, go:3, school:4})
    * `reverse_dictionary`: 将ID映射到字符串单词 (例如 {0:I, 1:like, 2:to, 3:go, 4:school})
    * `count`: （单词，频率）元素列表(例如 [(I,1),(like,1),(to,2),(go,1),(school,1)])
    * `data` : 包含我们读取的文本字符串，其中字符串单词被替换为单词ID(例如[0,1,2,3,2,4])
    
    它还引入了一个额外的特殊标记`UNK`，用于表示因出现次数太少而未被收入词汇表的稀有单词.
    
    #%%
    
    # Cap the vocabulary at 50000 entries (the UNK token included).
    vocabulary_size = 50000

    def build_dataset(words):
        """Turn a token list into word IDs plus the lookup tables that go with them.

        Args:
            words: Sequence of token strings.

        Returns:
            A tuple ``(data, count, dictionary, reverse_dictionary)`` where
            ``data`` is ``words`` with every token replaced by its ID,
            ``count`` is a list of (token, frequency) entries whose first entry
            is ``['UNK', <number of out-of-vocabulary tokens>]``,
            ``dictionary`` maps token -> ID and ``reverse_dictionary`` maps
            ID -> token.

        Raises:
            AssertionError: If the resulting dictionary does not contain exactly
                ``vocabulary_size`` entries.
        """
        # Entry 0 is the UNK placeholder; its frequency is filled in at the end.
        # The remaining entries are the vocabulary_size - 1 most frequent tokens.
        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

        # IDs are handed out in the order of `count`: each token gets the
        # dictionary's current length as its ID.
        dictionary = {}
        for token, _freq in count:
            dictionary[token] = len(dictionary)

        # Encode the corpus; anything outside the dictionary maps to ID 0 (UNK).
        data = []
        unk_count = 0
        for token in words:
            token_id = dictionary.get(token)
            if token_id is None:
                token_id = 0
                unk_count += 1
            data.append(token_id)

        # Record how many tokens were replaced by UNK.
        count[0][1] = unk_count

        reverse_dictionary = {token_id: token for token, token_id in dictionary.items()}
        # The dictionary must contain exactly vocabulary_size entries.
        assert len(dictionary) == vocabulary_size

        return data, count, dictionary, reverse_dictionary
    
    # Convert the token list to ID form and build the lookup tables.
    data, count, dictionary, reverse_dictionary = build_dataset(words)
    print('最常见的词(+UNK)', count[:5])
    print('样本数据', data[:10])
    del words  # free memory: the raw token list is not needed once `data` holds the IDs
    
    #%% md
    
    ## 为GloVe生成批量数据
    生成一批目标词（`batch`）、一批相应的上下文词（`labels`）以及每个数据点的权重（`weights`）。它每次读取总窗口大小为 `2*window_size+1` 的单词（称为 `span`），
    并在单个 span 内创建 `2*window_size` 个数据点。函数以这种方式继续，直到创建出 `batch_size` 个数据点。每当到达单词序列的末尾时，
    我们就从头开始.
    
    #%%
    
    # Read cursor into the global `data` list; advanced by generate_batch.
    data_index = 0

    def generate_batch(batch_size, window_size):
        """Build one batch of (target, context, weight) triples from the global ``data``.

        A window of ``2 * window_size + 1`` word IDs (the span) is read starting
        at the global ``data_index``. For each span, the centre word is paired
        with every other word in the span, giving ``2 * window_size`` data
        points. The span is then shifted by one word, until ``batch_size`` data
        points exist. Reading wraps around to the start of ``data`` at its end.

        Args:
            batch_size: Number of data points to produce; must be a multiple of
                ``2 * window_size``.
            window_size: Number of context words taken on each side of the
                target word; must be at least 1.

        Returns:
            A tuple ``(batch, labels, weights)``: ``batch`` is an int32 array of
            shape ``(batch_size,)`` with target IDs, ``labels`` an int32 array of
            shape ``(batch_size, 1)`` with context IDs, and ``weights`` a float32
            array of shape ``(batch_size,)`` holding 1/distance between the
            context word and the target word.

        Raises:
            ValueError: If ``window_size < 1`` or ``batch_size`` is not a
                multiple of ``2 * window_size``.
        """
        # data_index moves forward by one for every word read.
        global data_index

        if window_size < 1:
            raise ValueError('window_size must be >= 1, got %r' % (window_size,))

        # Number of context words sampled for a single target word.
        num_samples = 2 * window_size

        # The output arrays are allocated uninitialised and filled in groups of
        # num_samples; a batch_size that is not a multiple would leave the tail
        # slots holding arbitrary memory, so reject it up front.
        if batch_size % num_samples != 0:
            raise ValueError(
                'batch_size (%r) must be a multiple of 2*window_size (%r)'
                % (batch_size, num_samples))

        # Target words, context words and per-pair weights.
        batch = np.empty(batch_size, dtype=np.int32)
        labels = np.empty((batch_size, 1), dtype=np.int32)
        weights = np.empty(batch_size, dtype=np.float32)

        # Total window: [ window_size words | target | window_size words ]
        span = num_samples + 1

        # Sliding window over `data`; appending drops the oldest entry.
        buffer = collections.deque(maxlen=span)
        for _ in range(span):
            buffer.append(data[data_index])
            data_index = (data_index + 1) % len(data)

        # Every position in the span except the centre (the target itself).
        context_positions = [j for j in range(span) if j != window_size]

        for i in range(batch_size // num_samples):
            base = i * num_samples
            for k, j in enumerate(context_positions):
                batch[base + k] = buffer[window_size]
                labels[base + k, 0] = buffer[j]
                # Closer context words get a larger weight: 1 / distance.
                weights[base + k] = abs(1.0 / (j - window_size))

            # This span is exhausted; slide the window forward by one word.
            buffer.append(data[data_index])
            data_index = (data_index + 1) % len(data)
        return batch, labels, weights
    
    # Show the first eight words so the batches printed below can be checked by eye.
    print('data:', [reverse_dictionary[di] for di in data[:8]])
    
    # Demo: generate one 8-point batch for each of two window sizes.
    for window_size in [2, 4]:
        data_index = 0  # restart from the beginning of `data` for each demo run
        batch, labels, weights = generate_batch(batch_size=8, window_size=window_size)
        print('\n使用 window_size = %d:' %window_size)
        print('    batch:', [reverse_dictionary[bi] for bi in batch])
        print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
        print('    weights:', [w for w in weights])
    
    #%% md
    
    ## 创建单词共现矩阵
    
    GloVe方法之所以如此引人注目，是因为它在模型中使用了语料库的全局统计。这是通过使用单词共现矩阵中的信息来优化单词向量来实现的。共现矩阵的第 (i,j) 项 $X_{ij}$ 表示单词 j 在单词 i 附近出现的（加权）频率。我们会使用加权机制来给相邻单词赋予更大的权重（相比于距离更远的单词而言）。
    
    #%%
    
    # The co-occurrence matrix is stored as a SciPy LIL (list-of-lists) sparse matrix.
    cooc_data_index = 0  # NOTE(review): not referenced anywhere else in the visible code
    dataset_size = len(data) 
    skip_window = 4 # number of words considered on each side of the centre (target) word
    
    # Sparse matrix holding the weighted word co-occurrence values
    cooc_mat = lil_matrix((vocabulary_size, vocabulary_size), dtype=np.float32)
    
    print(cooc_mat.shape)
    def generate_cooc(batch_size,skip_window):
        """Fill the global ``cooc_mat`` by accumulating weighted co-occurrences.

        Runs ``dataset_size // batch_size`` iterations. Each one draws a batch
        from ``generate_batch`` and adds every pair's weight to
        ``cooc_mat[target, context]``.

        Args:
            batch_size: Number of data points requested per ``generate_batch`` call.
            skip_window: Window size passed through to ``generate_batch``.

        Returns:
            None; ``cooc_mat`` is updated in place.
        """
        # generate_batch reads its position from the module-level data_index,
        # so the reset has to target that global. A plain local assignment would
        # leave the pass starting wherever an earlier call stopped.
        global data_index
        data_index = 0

        num_batches = dataset_size//batch_size
        print('执行 %d 次迭代去计算共现矩阵'%num_batches)
        for i in range(num_batches):
            # Progress report every 100000 iterations
            if i>0 and i%100000==0:
                print('\t完成 %d 次迭代'%i)

            # Draw one batch of (target, context, weight) triples
            batch, labels, weights = generate_batch(batch_size, skip_window)
            labels = labels.reshape(-1)

            # Add each pair's weight to its cell of the sparse matrix
            for inp,lbl,w in zip(batch,labels,weights):
                cooc_mat[inp,lbl] += (1.0*w)
    
    # Populate cooc_mat from the corpus.
    generate_cooc(8,skip_window)

    # Only a small sample of the co-occurrence matrix is printed.
    print('共现矩阵样本块')

    # Inspect ten target words: IDs 0..9, each replaced by random IDs until its
    # row total is between 10 and 50000.
    for i in range(10):
        idx_target = i

        # Pull the target's row out of the sparse matrix as a flat dense vector.
        ith_row = cooc_mat.getrow(idx_target)
        ith_row_dense = ith_row.toarray('C').reshape(-1)
        row_total = np.sum(ith_row_dense)

        # Keep drawing random words while the row total is outside the range.
        while row_total<10 or row_total>50000:
            idx_target = np.random.randint(0,vocabulary_size)
            ith_row = cooc_mat.getrow(idx_target)
            ith_row_dense = ith_row.toarray('C').reshape(-1)
            row_total = np.sum(ith_row_dense)

        print('\n目标词: "%s"'%reverse_dictionary[idx_target])

        # Context IDs ordered from largest to smallest co-occurrence value.
        sort_indices = np.argsort(ith_row_dense)[::-1]

        # Print the ten strongest context words to sanity-check cooc_mat.
        print('上下文词:',end='')
        for j in range(10):
            idx_context = sort_indices[j]
            context_entry = '"%s"(id:%d,count:%.2f), '%(
                reverse_dictionary[idx_context],idx_context,ith_row_dense[idx_context])
            print(context_entry, end='')
        print()
    
    #%% md
    
    ## GloVe 算法
    
    #%% md
    
    ### 定义超参数
    这里我们定义几个超参数，包括`batch_size`（单个批次中的样本量）`embedding_size`（嵌入向量的大小）`window_size`（上下文窗口大小）.
    
    #%%
    
    batch_size = 128 # number of data points in one batch
    embedding_size = 128 # dimensionality of the embedding vectors
    window_size = 4 # number of words considered on each side of the centre (target) word
    
    # A random validation set of word IDs is used to sample nearest neighbours
    valid_size = 16 # number of word IDs drawn in each of the two samples below
    # Validation IDs are drawn at random from a window of IDs instead of being fixed
    valid_window = 50
    
    # Draw valid_size IDs from [0, valid_window) and another valid_size IDs from
    # [1000, 1000+valid_window), so valid_examples holds 2*valid_size IDs in total.
    # build_dataset assigns IDs in most-common-first order, so the first range
    # covers very frequent words and the second range less frequent ones.
    valid_examples = np.array(random.sample(range(valid_window), valid_size))
    valid_examples = np.append(valid_examples,random.sample(range(1000, 1000+valid_window), valid_size),axis=0)
    
    num_sampled = 32 # number of negative examples to sample; NOTE(review): not used by the visible GloVe code
    
    epsilon = 1 # added inside the logarithm of the loss for numerical stability
    
    #%% md
    
    ### 定义输入和输出
    
    在这里，我们定义了用于输入和输出训练的占位符（每个大小为`batch_size`），以及一个包含验证示例的常数张量．
    
    #%%
    
    # Start from an empty default graph so re-running this cell does not stack up duplicate ops.
    tf.reset_default_graph()
    
    # Training input data (target word IDs)
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    # Training label data (context word IDs)
    train_labels = tf.placeholder(tf.int32, shape=[batch_size])
    # The validation input needs no placeholder: the word IDs used to evaluate
    # the word vectors were already chosen above (valid_examples).
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    #%% md
    
    ### 定义模型参数和其他变量
    我们现在定义四个TensorFlow变量,由输入嵌入层、输入偏差、输出嵌入层和输出偏差组成.
    
    #%%
    
    # Variables: an embedding matrix and a bias vector for the input (target) side,
    # and a second, separate pair for the output (context) side.
    # NOTE(review): the input and output variables are given the same `name=`
    # strings ('embeddings' / 'embeddings_bias'); presumably TensorFlow makes the
    # graph names unique automatically -- confirm before relying on names.
    in_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),name='embeddings')
    in_bias_embeddings = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01,dtype=tf.float32),name='embeddings_bias')
    
    out_embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),name='embeddings')
    out_bias_embeddings = tf.Variable(tf.random_uniform([vocabulary_size],0.0,0.01,dtype=tf.float32),name='embeddings_bias')
    
    #%% md
    
    ### 定义模型计算
    
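    我们首先定义一个查找函数来为一组给定的输入获取相应的嵌入向量。然后，我们定义两个占位符，分别接收给定批数据点的权重（`weights_x`）和
    共现矩阵的值（`x_ij`）。
    `weights_x` 衡量一个数据点对于这两个词共同出现程度的重要性，`x_ij` 表示由数据点中的单词所确定的行和列处的共现矩阵值。通过这些定义，
    我们可以定义如下所示的损失：$J = \frac{1}{B}\sum_{(i,j)} f(X_{ij})\,\bigl(w_i^{\top}\tilde{w}_j + b_i + \tilde{b}_j - \log(\epsilon + X_{ij})\bigr)^2$，其中 $B$ 为批大小，$f(X_{ij})$ 即 `weights_x`。有关详细信息，请参阅第4章内容．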
    
    #%%
    
    # Look up the embeddings and biases for the batch.
    # Input (target) and output (context) words use two separate embedding spaces.
    embed_in = tf.nn.embedding_lookup(in_embeddings, train_dataset)
    embed_out = tf.nn.embedding_lookup(out_embeddings, train_labels)
    embed_bias_in = tf.nn.embedding_lookup(in_bias_embeddings,train_dataset)
    embed_bias_out = tf.nn.embedding_lookup(out_bias_embeddings,train_labels)
    
    # Per-data-point weight used in the loss function
    weights_x = tf.placeholder(tf.float32,shape=[batch_size],name='weights_x') 
    # Co-occurrence value at the (target, context) position of each data point
    x_ij = tf.placeholder(tf.float32,shape=[batch_size],name='x_ij')
    
    # Loss, computed for a whole batch at once and averaged over it:
    #   mean( weights_x * (embed_in . embed_out + bias_in + bias_out - log(epsilon + x_ij))^2 )
    # The biases are looked up the same way as the embeddings, one scalar per word.
    loss = tf.reduce_mean(
        weights_x * (tf.reduce_sum(embed_in*embed_out,axis=1) + embed_bias_in + embed_bias_out - tf.log(epsilon+x_ij))**2)
    
    
    #%% md
    
    ### 计算单词相似度
    
    我们根据余弦距离计算两个给定单词之间的相似性.为了有效地执行此操作，我们使用矩阵运算来执行此操作，如下所示.
    
    #%%
    
    # Similarity between the validation words and every word in the vocabulary.
    # The final embedding is the average of the input and output embeddings; rows
    # are scaled to unit L2 norm so the matrix product below is the cosine similarity.
    embeddings = (in_embeddings + out_embeddings)/2.0
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))
    
    #%% md
    
    ### 模型参数优化器
    
    然后我们定义一个恒定的学习率和一个使用Adagrad方法的优化器。你也可以随意尝试[这里](https://tensorflow.google.cn/api_guides/python/train)列出的其他优化器。
    
    #%%
    
    # Optimizer: Adagrad with a learning rate of 1.0, minimising the GloVe loss.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
    
    #%% md
    
    ## 运行GloVe 算法
    
    这里我们运行上面定义的结构化GloVe算法.具体来说，我们首先初始化变量，然后对算法进行多步（`num_steps`）训练．
    我们在固定的验证集上每隔几个步长就对算法进行评估，并打印出与给定单词集最接近的单词。
    
    #%%
    
    num_steps = 100001
    glove_loss = []  # average loss recorded every 2000 steps

    average_loss = 0
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as session:

        tf.global_variables_initializer().run()
        print('初始化')

        for step in range(num_steps):

            # Draw one batch of (target, context) word-ID pairs. The positional
            # weights returned by generate_batch are not used here: the loss
            # weights are derived from the co-occurrence matrix below.
            batch_data, batch_labels, _ = generate_batch(
                batch_size, skip_window)
            flat_labels = batch_labels.reshape(-1)

            batch_weights = [] # weighting used in the loss function
            batch_xij = [] # weighted frequency of finding i near j

            # Compute the loss weight for every data point in the batch.
            for inp,lbl in zip(batch_data,flat_labels):
                # Read the sparse matrix once per pair; element access on a
                # LIL matrix is comparatively slow.
                cooc_value = cooc_mat[inp,lbl]
                # Weight is (x/100)^0.75 below 100 and is capped at 1.0 otherwise.
                point_weight = (cooc_value/100.0)**0.75 if cooc_value<100.0 else 1.0
                batch_weights.append(point_weight)
                batch_xij.append(cooc_value)
            batch_weights = np.clip(batch_weights,-100,1)
            batch_xij = np.asarray(batch_xij)

            # Fill the feed_dict and run one optimisation step, also fetching the loss:
            # train_dataset/train_labels: target and context word IDs
            # weights_x: how much each data point's co-occurrence counts in the loss
            # x_ij: co-occurrence matrix value at the row/column of each data point
            feed_dict = {train_dataset : batch_data.reshape(-1), train_labels : flat_labels,
                        weights_x:batch_weights,x_ij:batch_xij}
            _, l = session.run([optimizer, loss], feed_dict=feed_dict)

            # Update the running loss; report and reset it every 2000 steps.
            average_loss += l
            if step % 2000 == 0:
              if step > 0:
                average_loss = average_loss / 2000
              # The average is an estimate of the loss over the last 2000 batches
              print('第 %d 步长上的平均损失: %f' % (step, average_loss))
              glove_loss.append(average_loss)
              average_loss = 0

            # Every 10000 steps, list the top_k nearest words (by cosine
            # similarity) for each validation word.
            # Note: this step is computationally expensive.
            if step % 10000 == 0:
              sim = similarity.eval()
              # valid_examples holds 2*valid_size IDs (a frequent-word sample and
              # a rarer-word sample), so iterate over all of them, not valid_size.
              for i in range(len(valid_examples)):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                # Index 0 of the ranking is skipped (the word itself is expected there).
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = '与 %s 最接近的单词:' % valid_word
                for k in range(top_k):
                  close_word = reverse_dictionary[nearest[k]]
                  log = '%s %s,' % (log, close_word)
                print(log)

        # Keep the unit-normalised averaged embeddings for use after the session closes.
        final_embeddings = normalized_embeddings.eval()
    
    
    #%%