{"id":1389,"date":"2022-05-22T19:18:43","date_gmt":"2022-05-22T11:18:43","guid":{"rendered":"http:\/\/www.eait.co\/?p=1389"},"modified":"2022-05-22T23:09:11","modified_gmt":"2022-05-22T15:09:11","slug":"nlp%e5%88%9d%e6%ad%a5-%e5%85%b6%e5%ae%83","status":"publish","type":"post","link":"https:\/\/notes.coremix.net\/?p=1389","title":{"rendered":"NLP\u521d\u6b65\u2014\u2014\u5176\u5b83"},"content":{"rendered":"<p>\u548c\u5317\u5927\u5904\u7406\u5de5\u5177\u5dee\u4e0d\u591a<\/p>\n<p>SPacy\u5546\u4e1a\u5f00\u6e90\u8f6f\u4ef6\uff0c\u901f\u5ea6\u6700\u5feb\uff0c\u4f46\u662f\u4e0d\u652f\u6301\u4e2d\u6587<\/p>\n<p><span class=\"md-line md-end-block md-focus\" contenteditable=\"true\"><span class=\"md-expand\">8\u3001Gensim<\/span><\/span><span class=\"md-line md-end-block\" contenteditable=\"true\">\u6587\u672c\u7684\u5411\u91cf\u8868\u793a \u7279\u5f81\u63d0\u53d6\u3002<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">TF-IDF\u3001word2vec\u3002Bag of Words BOW \u8bcd\u888b\u6a21\u578b<\/span><\/p>\n<p><span class=\"md-line md-end-block\" contenteditable=\"true\">pip install gensim<\/span><\/p>\n<p>spacy.load\uff08&#8217;en&#8217;\uff09 \u7684\u4e00\u4e2abug<\/p>\n<p>python -m spacy install en -i <a href=\"http:\/\/pypi.douban.com\/simple\/\">http:\/\/pypi.douban.com\/simple\/<\/a>\u00a0&#8211;trusted-host\u00a0<a href=\"http:\/\/pypi.douban.com\/\">pypi.douban.com<\/a><\/p>\n<p>\u7136\u540eload\u00a0 en_core_web_sm\u00a0 \u00a0\u5b89\u88c5\u65f6\u6700\u597d\u4e5f\u7528\u8fd9\u4e2a\u4ee3\u66ffen<\/p>\n<p>&nbsp;<\/p>\n<p>\u5176\u5b83\u4ee3\u7801\uff1a<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\nimport thulac\r\nimport codecs\r\n\r\ndef ReadFile(filePath,encoding=&quot;utf-8&quot;):\r\n    with codecs.open(filePath,&quot;r&quot;,encoding) as f:\r\n        return f.read()\r\ndef WriteFile(filePath,content,encoding=&quot;gbk&quot;):\r\n    with codecs.open(filePath,&quot;w&quot;,encoding) as f:\r\n        
f.write(content)\r\ndef UTF8_2_GBK(src,dst):\r\n    content=ReadFile(src,encoding=&quot;utf-8&quot;)\r\n    WriteFile(dst,content,encoding=&quot;gbk&quot;)\r\nthu1 = thulac.thulac(seg_only=True)  #\u9ed8\u8ba4\u6a21\u5f0f\r\ntext = thu1.cut(&quot;\u6211\u7231\u5317\u4eac\u5929\u5b89\u95e8&quot;, text=True)  #\u8fdb\u884c\u4e00\u53e5\u8bdd\u5206\u8bcd\r\nprint(text)\r\n#2\u6587\u4ef6\u5206\u8bcd\r\nthul_f=thulac.thulac()\r\nUTF8_2_GBK(&quot;input.txt&quot;,&quot;input2.txt&quot;)\r\nUTF8_2_GBK(&quot;output.txt&quot;,&quot;output2.txt&quot;)\r\nthul_f.cut_f(&quot;input2.txt&quot;,&quot;output2.txt&quot;)\r\nprint(&quot;\u6587\u4ef6\u5206\u8bcd\u6210\u529f\uff01&quot;)\r\n<\/pre>\n<p>&nbsp;<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\n# -*- coding:utf-8 -*-\r\nfrom gensim import corpora\r\nfrom gensim import models\r\nimport jieba\r\nraw_documents = &#x5B;\r\n    '0\u65e0\u507f\u5c45\u95f4\u4ecb\u7ecd\u4e70\u5356\u6bd2\u54c1\u7684\u884c\u4e3a\u5e94\u5982\u4f55\u5b9a\u6027',\r\n    '1\u5438\u6bd2\u7537\u52a8\u6001\u6301\u6709\u5927\u91cf\u6bd2\u54c1\u7684\u884c\u4e3a\u8be5\u5982\u4f55\u8ba4\u5b9a',\r\n    '2\u5982\u4f55\u533a\u5206\u662f\u975e\u6cd5\u79cd\u690d\u6bd2\u54c1\u539f\u690d\u7269\u7f6a\u8fd8\u662f\u975e\u6cd5\u5236\u9020\u6bd2\u54c1\u7f6a',\r\n    '3\u4e3a\u6bd2\u8d29\u8d29\u5356\u6bd2\u54c1\u63d0\u4f9b\u5e2e\u52a9\u6784\u6210\u8d29\u5356\u6bd2\u54c1\u7f6a',\r\n    '4\u5c06\u81ea\u5df1\u5438\u98df\u7684\u6bd2\u54c1\u539f\u4ef7\u8f6c\u8ba9\u7ed9\u670b\u53cb\u5438\u98df\u7684\u884c\u4e3a\u8be5\u5982\u4f55\u8ba4\u5b9a',\r\n    '5\u4e3a\u83b7\u62a5\u916c\u5e2e\u4eba\u8d2d\u4e70\u6bd2\u54c1\u7684\u884c\u4e3a\u8be5\u5982\u4f55\u8ba4\u5b9a',\r\n    '6\u6bd2\u8d29\u51fa\u72f1\u540e\u518d\u6b21\u591f\u4e70\u6bd2\u54c1\u9014\u4e2d\u88ab\u6293\u7684\u884c\u4e3a\u8ba4\u5b9a',\r\n    '7\u865a\u5938\u6bd2\u54c1\u529f\u6548\u529d\u4eba\u5438\u98df\u6bd2\u54c1\u7684\u884c\u4e3a\u8be5\u5982\u4f55\u8ba4\u5b9a',\r\n    
'8\u59bb\u5b50\u4e0b\u843d\u4e0d\u660e\u4e08\u592b\u53c8\u4e0e\u4ed6\u4eba\u767b\u8bb0\u7ed3\u5a5a\u662f\u5426\u4e3a\u65e0\u6548\u5a5a\u59fb',\r\n    '9\u4e00\u65b9\u672a\u7b7e\u5b57\u529e\u7406\u7684\u7ed3\u5a5a\u767b\u8bb0\u662f\u5426\u6709\u6548',\r\n    '10\u592b\u59bb\u53cc\u65b91990\u5e74\u6309\u519c\u6751\u4e60\u4fd7\u4e3e\u529e\u5a5a\u793c\u6ca1\u6709\u7ed3\u5a5a\u8bc1 \u4e00\u65b9\u53ef\u5426\u8d77\u8bc9\u79bb\u5a5a',\r\n    '11\u7ed3\u5a5a\u524d\u5bf9\u65b9\u7236\u6bcd\u51fa\u8d44\u8d2d\u4e70\u7684\u4f4f\u623f\u5199\u6211\u4eec\u4e8c\u4eba\u7684\u540d\u5b57\u6709\u6548\u5417',\r\n    '12\u8eab\u4efd\u8bc1\u88ab\u522b\u4eba\u5192\u7528\u65e0\u6cd5\u767b\u8bb0\u7ed3\u5a5a\u600e\u4e48\u529e\uff1f',\r\n    '13\u540c\u5c45\u540e\u53c8\u4e0e\u4ed6\u4eba\u767b\u8bb0\u7ed3\u5a5a\u662f\u5426\u6784\u6210\u91cd\u5a5a\u7f6a',\r\n    '14\u672a\u529e\u767b\u8bb0\u53ea\u4e3e\u529e\u7ed3\u5a5a\u4eea\u5f0f\u53ef\u8d77\u8bc9\u79bb\u5a5a\u5417',\r\n    '15\u540c\u5c45\u591a\u5e74\u672a\u529e\u7406\u7ed3\u5a5a\u767b\u8bb0\uff0c\u662f\u5426\u53ef\u4ee5\u5411\u6cd5\u9662\u8d77\u8bc9\u8981\u6c42\u79bb\u5a5a'\r\n]\r\ntexts=&#x5B;&#x5B;word for word in jieba.cut(document,cut_all=False)]for document in raw_documents]\r\n# texts=&#x5B;]\r\n# for document in raw_documents:\r\n#     for word in jieba.cut(document):\r\n#         texts.append(word)\r\n# print(texts)\r\n\r\n# \u751f\u6210\u8bcd\u5178\r\ndictionary=corpora.Dictionary(texts)\r\nprint(dictionary)\r\n# \u751f\u6210\u8bed\u6599\r\ncorpus=&#x5B;dictionary.doc2bow(text) for text in texts]\r\nprint(corpus)\r\n#\u751f\u6210tf-idf\u6a21\u578b\r\ntfidf_model=models.TfidfModel(corpus)\r\ncorpus_tfidf=tfidf_model&#x5B;corpus]\r\nfor item in corpus_tfidf:\r\n    print(item)\r\n\r\n\r\n<\/pre>\n<p>&nbsp;<\/p>\n<pre class=\"brush: python; title: ; notranslate\" title=\"\">\r\n# -*- coding:utf-8 -*-\r\nimport spacy\r\nnlp=spacy.load('en')#\u4e0d\u652f\u6301\u4e2d\u6587\r\ntext=&quot;I love coco! 
5G is comming.&quot;\r\ntest_words=nlp(text)\r\nprint(8*&quot;*&quot;,&quot;\u5206\u8bcd&quot;,8*&quot;*&quot;)\r\nfor word in test_words:\r\n    print(word)\r\n#2 \u547d\u540d\u5b9e\u4f53\u8bc6\u522b\r\ntext=&quot;It is a beautiful flower\uff01&quot;\r\n#\r\n# test_words=nlp(text)\r\n# for ent in test_words.ents:\r\n#     print(ent,ent.label_,ent.label)\r\nnlp = spacy.load(&quot;en_core_web_sm&quot;)\r\ndoc = nlp(u&quot;Apple is looking at buying U.K. startup for $1 billion&quot;)\r\nfor ent in doc.ents:\r\n    print(ent.text, ent.start_char, ent.end_char, ent.label_)\r\n\r\n<\/pre>\n<p>spacy<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u548c\u5317\u5927\u5904\u7406\u5de5\u5177\u5dee\u4e0d\u591a SPacy\u5546\u4e1a\u5f00\u6e90\u8f6f\u4ef6\uff0c\u901f\u5ea6\u6700\u5feb\uff0c\u4f46\u662f\u4e0d\u652f\u6301\u4e2d\u6587 8\u3001Gensim\u6587\u672c\u7684\u5411\u91cf\u8868\u793a \u7279\u5f81\u63d0 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[20],"class_list":["post-1389","post","type-post","status-publish","format-standard","hentry","category-uncategorized","tag-python"],"blocksy_meta":[],"_links":{"self":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1389","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=1389"}],"version-history":[{"count":4,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1389\/rev
isions"}],"predecessor-version":[{"id":1400,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=\/wp\/v2\/posts\/1389\/revisions\/1400"}],"wp:attachment":[{"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=1389"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=1389"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/notes.coremix.net\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=1389"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}