# Collect the words of every document and build the vocabulary list.
def createVocabList(dataset):
    """Return a list of the distinct words found across all documents.

    Args:
        dataset: Iterable of documents, each an iterable of hashable words.

    Returns:
        A list containing every distinct word exactly once. The order comes
        from set iteration and is therefore not guaranteed.
    """
    vocabSet = set()
    for document in dataset:
        vocabSet |= set(document)
    return list(vocabSet)


# Convert a target document into its word-vector form.
def setOfwords2Vec(vocabList, inputList):
    """Return the bag-of-words count vector of ``inputList`` over ``vocabList``.

    This is the bag-of-words model rather than the set-of-words model: merely
    recording whether a word occurred loses the information carried by
    repeated occurrences, so each occurrence increments the word's slot.

    Args:
        vocabList: List of vocabulary words; position ``i`` of the result
            counts occurrences of ``vocabList[i]``.
        inputList: Iterable of words making up the document to convert.

    Returns:
        A list of ints with the same length as ``vocabList``. Words missing
        from the vocabulary are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once instead of scanning the list for
    # every input word. setdefault keeps the FIRST position of a duplicated
    # vocabulary word, which is what list.index() would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputList:
        position = wordIndex.get(word)
        if position is None:
            print('the word %s is not in vocabList' % word)
        else:
            returnVec[position] += 1
    return returnVec