1:HL["/_next/static/media/e4af272ccee01ff0-s.p.woff2","font",{"crossOrigin":"","type":"font/woff2"}] 2:HL["/_next/static/css/97132489b96da1d5.css","style",{"crossOrigin":""}] 0:["Y_TW_5cOL4VPb7FuqRz3I",[[["",{"children":[["slug","blog27","d"],{"children":["__PAGE__?{\"slug\":\"blog27\"}",{}]}]},"$undefined","$undefined",true],"$L3",[[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/97132489b96da1d5.css","precedence":"next","crossOrigin":""}]],"$L4"]]]] 5:HL["/_next/static/css/5b2728e81018a7be.css","style",{"crossOrigin":""}] 6:I[8326,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] 7:I[6954,[],""] 8:I[7264,[],""] 3:[null,["$","html",null,{"lang":"ja","children":["$","body",null,{"className":"__className_f367f3 flex flex-col min-h-screen","children":[["$","header",null,{"className":"blog-header py-5","children":["$","div",null,{"className":"container mx-auto px-4","children":["$","div",null,{"className":"flex flex-col items-start","children":[["$","$L6",null,{"href":"/","className":"hover:no-underline","children":["$","h1",null,{"className":"text-6xl font-serif text-gray-800 mb-2 font-normal","children":"Shingoの数学ノート"}]}],["$","p",null,{"className":"text-xl text-gray-400 font-normal","children":"プログラミングと機械学習のメモ"}]]}]}]}],["$","main",null,{"className":"flex-grow","children":["$","$L7",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","template":["$","$L8",null,{}],"templateStyles":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"childProp":{"current":["$","$L7",null,{"parallelRouterKey":"children","segmentPath":["children",["slug","blog27","d"],"children"],"loading":"$undefined","loadingStyles":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","template":["$","$L8",null,{}],"templateStyles":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","childProp":{"current":["$L9","$La",null],"segment":"__PAGE__?{\"slug\":\"blog27\"}"},"styles":[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/5b2728e81018a7be.css","precedence":"next","crossOrigin":""}]]}],"segment":["slug","blog27","d"]},"styles":[]}]}],["$","footer",null,{"className":"bg-[#DDDDDD] text-[#999999] py-8 mt-12 text-center border-t border-[#e5e5e5]","children":["$","div",null,{"className":"container mx-auto px-4","children":[["$","p",null,{"className":"mb-2","children":"© All rights reserved by Shingo Sekine."}],["$","p",null,{"children":["$","a",null,{"href":"#","className":"hover:text-blue-500 transition-colors","children":"Back to top"}]}]]}]}]]}]}],null] 4:[["$","meta","0",{"charSet":"utf-8"}],["$","title","1",{"children":"Shingoの数学ノート"}],["$","meta","2",{"name":"description","content":"プログラミング言語と機械学習のメモ"}],["$","meta","3",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","link","4",{"rel":"icon","href":"/favicon.ico","type":"image/x-icon","sizes":"256x256"}],["$","meta","5",{"name":"next-size-adjust"}]] b:I[6180,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] d:I[9703,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] c:T7293,/*@jsxRuntime automatic @jsxImportSource react*/ const {Fragment: _Fragment, jsx: _jsx, jsxs: _jsxs} = arguments[0]; const {useMDXComponents: _provideComponents} = arguments[0]; function _createMdxContent(props) { const _components = Object.assign({ p: "p", a: "a", h3: "h3", ul: "ul", li: "li", img: "img", pre: "pre", code: "code", span: "span" }, _provideComponents(), props.components); return _jsxs(_Fragment, { children: [_jsx(_components.p, { children: "こんにちは!" }), "\n", _jsxs(_components.p, { children: ["今回は、", _jsx(_components.a, { href: "/blog14.html", children: "自然言語処理" }), "の続きで、自然言語処理におけるLDA(トピックモデル)の使い方について説明します。"] }), "\n", _jsx(_components.p, { children: "なお、LDAの内部の理論はとても難しいので、LDAの概要だけ説明します。" }), "\n", _jsx(_components.p, { children: "イメージを掴んで欲しいのが第一のため、正確でない部分も多々ありますが、ご容赦ください。" }), "\n", _jsx(_components.h3, { children: "LDAの概要" }), "\n", _jsx(_components.p, { children: "LDAは教師なし学習の一種で、文書のクラスタリングを行います。" }), "\n", _jsx(_components.p, { children: "LDA のモデリングの際、以下の仮定を置きます。" }), "\n", _jsxs(_components.ul, { children: ["\n", _jsx(_components.li, { children: "文書は複数の潜在トピックから構成され、構成比は離散確率分布で表される。(「文書-トピック分布」と呼ぶ)" }), "\n", _jsx(_components.li, { children: "潜在トピックは単語の出現分布として表される。(「トピック-単語分布」と呼ぶ)" }), "\n", _jsx(_components.li, { children: "文書中の単語は、それぞれ「文書-トピック分布」から生成される潜在トピックをもち、「トピック分布-単語分布」から単語が生成されると仮定する。" }), "\n"] }), "\n", _jsx(_components.p, { children: "何を言っているかよくわからないと思うので、例を説明します。" }), "\n", _jsx(_components.p, { children: "例えば、あなたがニュースを分類しようとします。" }), "\n", _jsx(_components.p, { children: "そこで、あなたはスポーツの記事、政治の記事、音楽の記事などのトピックごとに分類します。" }), "\n", _jsx(_components.p, { children: "この記事はスポーツ記事である確率は60%、政治の記事である確率30%、音楽の記事である確率が10%かな、という感じで分けます。" }), "\n", _jsx(_components.p, { children: "この分布を「文書-トピック分布」と呼びます。" }), "\n", _jsx(_components.p, { children: "ところで、このスポーツ記事、政治記事、音楽記事はどのように決めているのでしょうか。" }), "\n", _jsx(_components.p, { children: "これは、それぞれの記事で特徴となっている単語の量で決定します。" }), "\n", _jsx(_components.p, { children: "スポーツ記事であれば、「野球」、「サッカー」、「テニス」等、 政治記事であれば、「内閣」、「政府」、「政策」等、 音楽記事であれば、「ライブ」、「バンド」、「オリコン」等が出てきやすいのではないでしょうか。" }), "\n", _jsx(_components.p, { children: "これを、それぞれの記事ごとに単語の分布(構成比)として表現します。" }), "\n", _jsx(_components.p, { children: "スポーツ記事は野球10%、サッカー9%、テニス8%みたいな感じです。" }), "\n", _jsx(_components.p, { children: "この分布を「トピック-単語分布」と呼びます。" }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda1.png", alt: "lda1", title: "lda1" }) }), "\n", _jsx(_components.p, { children: "LDAでは、この「文書-トピック分布」と「トピック-単語分布」を求めることをタスクとします。" }), "\n", _jsx(_components.p, { children: "機械では、スポーツ、政治、音楽とラベル付けすることはできないので、「トピック-単語分布」からトピックのラベルを分析者が決定します。" }), "\n", _jsx(_components.p, { children: "クラスタリングをするなら、文書-トピック分布において、最も高い確率のトピックを選択すれば良いです。" }), "\n", _jsx(_components.h3, { children: "PythonでLDAを実装" }), "\n", _jsx(_components.p, { children: "追記20200223:githubにコードを載せました。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "https://github.com/Shingo425/NLP/blob/main/src/LDA_sample.ipynb", children: "https://github.com/Shingo425/NLP/blob/main/src/LDA_sample.ipynb" }) }), "\n", _jsx(_components.p, { children: "では実際にLDAを実装してみましょう。" }), "\n", _jsx(_components.p, { children: "必要なモジュールをインポートします。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " pandas ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " pd\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " MeCab\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " gensim\n\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " numpy ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " np\n\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " matplotlib.pyplot ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " plt\n"] }) }), "\n", _jsx(_components.p, { children: "テキストデータはライブドアコーパスを使用します。" }), "\n", _jsx(_components.p, { children: "ライブドアコーパスの抽出については以下を参考にしています。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "https://nxdataka.netlify.app/ldncsv/", children: "https://nxdataka.netlify.app/ldncsv/" }) }), "\n", _jsx(_components.p, { children: "「livedoornews.csv」をソースコードと同じ場所に置きましょう。" }), "\n", _jsx(_components.p, { children: "こんな感じの中身です。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["livedoor =\npd.read_csv(", _jsx(_components.span, { className: "hljs-string", children: "\"livedoornews.csv\"" }), ")\nlivedoor.head()\n"] }) }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda2.png", alt: "lda2", title: "lda2" }) }), "\n", _jsx(_components.p, { children: "まずは、形態素解析です。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "parse" }), "(", _jsx(_components.span, { className: "hljs-params", children: "tweet_temp" }), "): \n t = MeCab.Tagger()\n temp1 = t.parse(tweet_temp)\n temp2 = temp1.split(", _jsx(_components.span, { className: "hljs-string", children: "\"\\n\"" }), ")\n t_list = []\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " keitaiso ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " temp2:\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " keitaiso ", _jsx(_components.span, { className: "hljs-keyword", children: "not" }), " ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " [", _jsx(_components.span, { className: "hljs-string", children: "\"EOS\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), "]: \n word,hinshi = keitaiso.split(", _jsx(_components.span, { className: "hljs-string", children: "\"\\t\"" }), ")\n t_temp = [word]+hinshi.split(", _jsx(_components.span, { className: "hljs-string", children: "\",\"" }), ")\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(t_temp) != ", _jsx(_components.span, { className: "hljs-number", children: "10" }), ": \n t_temp += [", _jsx(_components.span, { className: "hljs-string", children: "\"*\"" }), "]*(", _jsx(_components.span, { className: "hljs-number", children: "10" }), " - ", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(t_temp))\n t_list.append(t_temp)\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " t_list\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "parse_to_df" }), "(", _jsx(_components.span, { className: "hljs-params", children: "tweet_temp" }), "):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " pd.DataFrame(parse(tweet_temp), columns=[", _jsx(_components.span, { className: "hljs-string", children: "\"単語\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"品詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"品詞細分類1\"" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"品詞細分類2\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"品詞細分類3\"" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"活用型\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"活用形\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"原形\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"読み\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"発音\"" }), "])\n"] }) }), "\n", _jsx(_components.p, { children: "次に、単語をBag-of-Words形式で保存し、LDAを使いやすい形に変形します。今回は、一般名詞と固有名詞のみを抽出します。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "make_lda_docs" }), "(", _jsx(_components.span, { className: "hljs-params", children: "texts" }), "):\n docs = []\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " text ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " texts: \n df = parse_to_df(text) \n extract_df = df[(df[", _jsx(_components.span, { className: "hljs-string", children: "\"品詞\"" }), "]+", _jsx(_components.span, { className: "hljs-string", children: "\"/\"" }), "+df[", _jsx(_components.span, { className: "hljs-string", children: "\"品詞細分類1\"" }), "]).isin([", _jsx(_components.span, { className: "hljs-string", children: "\"名詞/一般\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"名詞/固有名詞\"" }), "])] \n extract_df = extract_df[extract_df[", _jsx(_components.span, { className: "hljs-string", children: "\"原形\"" }), "]!=", _jsx(_components.span, { className: "hljs-string", children: "\"*\"" }), "] \n doc = []\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " genkei ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " extract_df[", _jsx(_components.span, { className: "hljs-string", children: "\"原形\"" }), "]: \n doc.append(genkei)\n docs.append(doc)\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " docs\n\ntexts = livedoor[", _jsx(_components.span, { className: "hljs-string", children: "\"body\"" }), "].values\ndocs = make_lda_docs(texts)\ndictionary = gensim.corpora.Dictionary(docs)\ncorpus = [dictionary.doc2bow(doc) ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " doc ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " docs]\n"] }) }), "\n", _jsx(_components.p, { children: "これでldaに入力するデータの作成が完了しました。" }), "\n", _jsx(_components.p, { children: "ちなみに、テキストに対して以下のような単語に分けています。" }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda3.png", alt: "lda3", title: "lda3" }) }), "\n", _jsx(_components.p, { children: "さて、実際にLDAを使用してみましょう。" }), "\n", _jsx(_components.p, { children: "今回は、クラスターをとりあえず6個にして分割してみます。" }), "\n", _jsx(_components.p, { children: "クラスターの数は、何回も回してみて決めましょう。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["n_cluster = ", _jsx(_components.span, { className: "hljs-number", children: "6" }), "\nlda = gensim.models.LdaModel(\n corpus=corpus,\n id2word=dictionary,\n num_topics=n_cluster,\n minimum_probability=", _jsx(_components.span, { className: "hljs-number", children: "0.001" }), ",\n passes=", _jsx(_components.span, { className: "hljs-number", children: "20" }), ",\n update_every=", _jsx(_components.span, { className: "hljs-number", children: "0" }), ",\n chunksize=", _jsx(_components.span, { className: "hljs-number", children: "10000" }), ",\n random_state=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ",\n)\n"] }) }), "\n", _jsx(_components.p, { children: "LDAのパラメータについては以下を参照してください。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "https://radimrehurek.com/gensim/models/ldamodel.html", children: "https://radimrehurek.com/gensim/models/ldamodel.html" }) }), "\n", _jsx(_components.p, { children: "最後に、学習したldaのベクトルをarrに格納します。" }), "\n", _jsx(_components.pre, { children: _jsx(_components.code, { className: "hljs language-python", children: "corpus_lda = lda[corpus]\narr = gensim.matutils.corpus2dense(corpus_lda, num_terms=n_cluster).T\n" }) }), "\n", _jsx(_components.h3, { children: "トピック-単語分布の可視化" }), "\n", _jsx(_components.p, { children: "実際にトピック-単語分布を見てみましょう。" }), "\n", _jsx(_components.p, { children: "お手軽に見るなら「lda.show_topics()」で見れますが、しっかり可視化するなら、棒グラフやDataFrameで表示すると見やすいです。" }), "\n", _jsx(_components.p, { children: "まずは棒グラフで表示する方法から。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["lists = []\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(n_cluster): \n temp_df = pd.DataFrame(lda.show_topic(i),columns=[", _jsx(_components.span, { className: "hljs-string", children: "\"word\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"score\"" }), "])\n temp_df[", _jsx(_components.span, { className: "hljs-string", children: "\"topic\"" }), "] = i \n lists.append(temp_df)\n\ntopic_word_df = pd.concat(lists,ignore_index=", _jsx(_components.span, { className: "hljs-literal", children: "True" }), ")\nfig, axes = plt.subplots(nrows=", _jsx(_components.span, { className: "hljs-number", children: "2" }), ", ncols=", _jsx(_components.span, { className: "hljs-number", children: "3" }), ", figsize=(", _jsx(_components.span, { className: "hljs-number", children: "20" }), ", ", _jsx(_components.span, { className: "hljs-number", children: "10" }), "))\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i,gdf ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " topic_word_df.groupby(", _jsx(_components.span, { className: "hljs-string", children: "\"topic\"" }), "):\n gdf.set_index(", _jsx(_components.span, { className: "hljs-string", children: "\"word\"" }), ")[", _jsx(_components.span, { className: "hljs-string", children: "\"score\"" }), "].sort_values().plot.barh( ax=axes[i//", _jsx(_components.span, { className: "hljs-number", children: "3" }), ", i%", _jsx(_components.span, { className: "hljs-number", children: "3" }), "], title=", _jsx(_components.span, { className: "hljs-string", children: "\"topic {} のトピック-単語分布\"" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "format" }), "(i), color=", _jsx(_components.span, { className: "hljs-string", children: "\"blue\"" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda4.png", alt: "lda4", title: "lda4" }) }), "\n", _jsx(_components.p, { children: "次はテーブルで順位を表示してみます。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["topic_word_df[", _jsx(_components.span, { className: "hljs-string", children: "\"rank\"" }), "] = topic_word_df.groupby(", _jsx(_components.span, { className: "hljs-string", children: "\"topic\"" }), ")[", _jsx(_components.span, { className: "hljs-string", children: "\"score\"" }), "].rank()\ntopic_word_df.pivot(index=", _jsx(_components.span, { className: "hljs-string", children: "'topic'" }), ", columns=", _jsx(_components.span, { className: "hljs-string", children: "'rank'" }), ", values=", _jsx(_components.span, { className: "hljs-string", children: "'word'" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda5.png", alt: "lda5", title: "lda5" }) }), "\n", _jsx(_components.p, { children: "topic1と2はなんとなくスポーツの話題、3はIT系の話題、4は映画の話題、5は男女の話題みたいな意味付けができそうな気がしますね。" }), "\n", _jsx(_components.h3, { children: "文書-トピック分布の可視化" }), "\n", _jsx(_components.p, { children: "ライブドアニュースコーパスでは、メディアの媒体データも付与されています。" }), "\n", _jsx(_components.p, { children: "そこで、それぞれのトピックについて、どんなメディアの文書の構成になっているかを可視化してみます。" }), "\n", _jsx(_components.p, { children: "今回は、それぞれの文書に最も高い確率のトピックを割り当てました。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["livedoor_predict = livedoor.copy() \n", _jsx(_components.span, { className: "hljs-comment", children: "# topicの付与 " }), "\nlivedoor_predict[", _jsx(_components.span, { className: "hljs-string", children: "\"pred_topic\"" }), "] = np.argmax(arr,axis=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\nlivedoor_predict[", _jsx(_components.span, { className: "hljs-string", children: "\"score\"" }), "] = np.", _jsx(_components.span, { className: "hljs-built_in", children: "max" }), "(arr,axis=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\ncross = pd.crosstab(livedoor_predict[", _jsx(_components.span, { className: "hljs-string", children: "\"media\"" }), "],livedoor_predict[", _jsx(_components.span, { className: "hljs-string", children: "\"pred_topic\"" }), "]) \n\n", _jsx(_components.span, { className: "hljs-comment", children: "# トピックの文書割合の可視化" }), "\nfig, ax = plt.subplots(figsize=(", _jsx(_components.span, { className: "hljs-number", children: "10" }), ", ", _jsx(_components.span, { className: "hljs-number", children: "8" }), "))\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(cross)): \n ax.barh(y=cross.columns, width = cross.iloc[i].values[::-", _jsx(_components.span, { className: "hljs-number", children: "1" }), "], left=cross.iloc[:i].", _jsx(_components.span, { className: "hljs-built_in", children: "sum" }), "()[::-", _jsx(_components.span, { className: "hljs-number", children: "1" }), "].values,tick_label=cross.columns[::-", _jsx(_components.span, { className: "hljs-number", children: "1" }), "])\n\nax.", _jsx(_components.span, { className: "hljs-built_in", children: "set" }), "(xlabel=", _jsx(_components.span, { className: "hljs-string", children: "'個数'" }), ", ylabel=", _jsx(_components.span, { className: "hljs-string", children: "'トピック'" }), ")\nax.legend(cross.index)\n\nplt.show()\n"] }) }), "\n", _jsx(_components.p, { children: _jsx(_components.img, { src: "/images/blog27/lda6.png", alt: "lda6", title: "lda6" }) }), "\n", _jsx(_components.p, { children: "トピック1,2はsports-watch、3はit-life-hackやkaden-channel,smax、4はmovie-enter、5はdokujo-tsushinが多くなりました。トピック-単語分布から想像していたものと近い結果が得られたのではないでしょうか。" }), "\n", _jsx(_components.h3, { children: "pyldavizによる可視化" }), "\n", _jsx(_components.p, { children: "上記の方法はまあまあプログラミングが必要でしたが、実は簡単に可視化することもできます。それがpyldavizです。以下のように使います。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " pyLDAvis\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " pyLDAvis.gensim\npyLDAvis.enable_notebook()\nvis = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=", _jsx(_components.span, { className: "hljs-literal", children: "False" }), ") \nvis\n"] }) }), "\n", _jsx(_components.p, { children: "htmlで保存することもできます。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["pyLDAvis.save_html(vis, ", _jsx(_components.span, { className: "hljs-string", children: "\"pyldavis_output.html\"" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: "実際の結果はこんな感じになります。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "./blog27_pyldavis.html", children: "pyldavisの結果" }) }), "\n", _jsx(_components.h3, { children: "まとめ" }), "\n", _jsx(_components.p, { children: "今回は、自然言語処理のトピックモデルの実践について解説していきました。文書がどんな性質を持っているのか可視化してみたいときはとても便利ですので、ぜひ使用してみてください!" }), "\n", _jsx(_components.h3, { children: "参考文献" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "https://www.amazon.co.jp/%E3%83%88%E3%83%94%E3%83%83%E3%82%AF%E3%83%A2%E3%83%87%E3%83%AB%E3%81%AB%E3%82%88%E3%82%8B%E7%B5%B1%E8%A8%88%E7%9A%84%E6%BD%9C%E5%9C%A8%E6%84%8F%E5%91%B3%E8%A7%A3%E6%9E%90-%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA-%E4%BD%90%E8%97%A4%E4%B8%80%E8%AA%A0/dp/4339027588", children: "トピックモデルによる統計的潜在意味解析" }) })] }); } function MDXContent(props = {}) { const {wrapper: MDXLayout} = Object.assign({}, _provideComponents(), props.components); return MDXLayout ? _jsx(MDXLayout, Object.assign({}, props, { children: _jsx(_createMdxContent, props) })) : _createMdxContent(props); } return { default: MDXContent }; a:["$","div",null,{"className":"container mx-auto px-4 py-8 max-w-7xl","children":["$","div",null,{"className":"flex flex-col md:flex-row gap-8","children":[["$","article",null,{"className":"w-full md:w-[70%] bg-white shadow-lg rounded-lg p-8","children":[["$","header",null,{"className":"mb-8 border-b pb-4","children":[["$","div",null,{"className":"flex items-center gap-2 mb-2","children":[["$","$L6",null,{"href":"/","className":"text-gray-500 hover:text-orange-500","children":"Home"}],["$","span",null,{"className":"text-gray-300","children":">"}],["$","span",null,{"className":"text-gray-500","children":"$undefined"}]]}],["$","h1",null,{"className":"text-3xl font-bold mb-4","children":"PythonでLDA(トピックモデル)の実装"}],["$","div",null,{"className":"text-gray-500 flex flex-wrap gap-4 items-center mb-6","children":[["$","time",null,{"className":"flex items-center gap-1 text-sm","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","height":"18px","viewBox":"0 0 24 24","width":"18px","fill":"#666666","className":"mr-1","children":[["$","path",null,{"d":"M0 0h24v24H0V0z","fill":"none"}],["$","path",null,{"d":"M19 3h-1V1h-2v2H8V1H6v2H5c-1.11 0-2 .9-2 2v14c0 1.1.89 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm0 16H5V8h14v11zM7 10h5v5H7z"}]]}],"日付: ","2020-10-31"]}],["$","div",null,{"className":"flex flex-wrap gap-2","children":[["$","$L6","自然言語処理",{"href":"/tags/自然言語処理","className":"bg-gray-100 hover:bg-blue-100 px-2 py-1 rounded text-sm text-gray-600 hover:text-blue-600 transition-colors","children":["#","自然言語処理"]}]]}]]}]]}],["$","div",null,{"className":"prose max-w-none prose-headings:border-b prose-headings:pb-2 prose-a:text-orange-600","children":["$","$Lb",null,{"compiledSource":"$c","frontmatter":{},"scope":{}}]}],["$","div",null,{"className":"flex justify-between items-center mt-12","children":[["$","div",null,{"className":"flex-1","children":["$","$L6",null,{"href":"/blog26","className":"group flex flex-col items-start p-4 rounded-lg border border-gray-100 hover:border-orange-200 hover:bg-orange-50 transition-all","children":[["$","span",null,{"className":"text-sm text-gray-500 mb-1 group-hover:text-orange-600 flex items-center","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","className":"h-4 w-4 mr-1","fill":"none","viewBox":"0 0 24 24","stroke":"currentColor","children":["$","path",null,{"strokeLinecap":"round","strokeLinejoin":"round","strokeWidth":2,"d":"M15 19l-7-7 7-7"}]}],"Previous"]}],["$","span",null,{"className":"font-medium text-gray-800 dark:text-gray-200 line-clamp-2","children":"線形回帰分析のt値がt分布に従う理由(証明)"}]]}]}],["$","div",null,{"className":"w-8"}],["$","div",null,{"className":"flex-1 text-right","children":["$","$L6",null,{"href":"/blog28","className":"group flex flex-col items-end p-4 rounded-lg border border-gray-100 hover:border-orange-200 hover:bg-orange-50 transition-all","children":[["$","span",null,{"className":"text-sm text-gray-500 mb-1 group-hover:text-orange-600 flex items-center","children":["Next",["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","className":"h-4 w-4 ml-1","fill":"none","viewBox":"0 0 24 24","stroke":"currentColor","children":["$","path",null,{"strokeLinecap":"round","strokeLinejoin":"round","strokeWidth":2,"d":"M9 5l7 7-7 7"}]}]]}],["$","span",null,{"className":"font-medium text-gray-800 dark:text-gray-200 line-clamp-2","children":"日本語ポジネガ分析の教師データ作成はYahoo!ローカルサーチAPIがおすすめって話"}]]}]}]]}],["$","$Ld",null,{}]]}],["$","div",null,{"className":"w-full md:w-[30%]","children":"$Le"}]]}]}] 9:null f:I[9009,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] e:["$","aside",null,{"className":"w-full flex flex-col gap-6","children":[["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"プロフィール"}],["$","div",null,{"className":"flex items-center mb-4","children":[["$","$Lf",null,{}],["$","span",null,{"className":"font-bold","children":"Shingo.S"}]]}],["$","p",null,{"className":"mb-4 text-base leading-relaxed text-gray-600","children":["データサイエンティストとして働いています。仕事では主にPythonやSAS、Rを用いて分析しています。",["$","br",null,{}],["$","br",null,{}],"統計検定1級、Kaggle Expert。",["$","br",null,{}],"自然言語処理に興味があります。"]}],["$","div",null,{"className":"mt-4 flex items-center","children":["$","a",null,{"href":"https://x.com/shingo97358922","target":"_blank","rel":"noopener noreferrer","className":"flex items-center gap-2 text-gray-600 hover:text-black transition-colors","children":[["$","svg",null,{"className":"w-5 h-5","fill":"currentColor","viewBox":"0 0 24 24","aria-hidden":"true","children":["$","path",null,{"d":"M18.244 2.25h3.308l-7.227 8.26 8.502 11.24H16.17l-5.214-6.817L4.99 21.75H1.68l7.73-8.835L1.254 2.25H8.08l4.713 6.231zm-1.161 17.52h1.833L7.084 4.126H5.117z"}]}],["$","span",null,{"className":"text-sm font-medium","children":"@shingo97358922"}]]}]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"アクセスカウンター"}],["$","div",null,{"className":"flex flex-col gap-2 text-gray-600 pl-2","children":[["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"累計"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"72320"}]]}],["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"本日"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"17"}]]}],["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"昨日"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"33"}]]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"月別アーカイブ"}],["$","ul",null,{"className":"text-base text-gray-600","children":[["$","li","2026年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2026-02","className":"hover:text-blue-500 transition-colors block","children":["2026年2月"," (",1,")"]}]}],["$","li","2025年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2025-02","className":"hover:text-blue-500 transition-colors block","children":["2025年2月"," (",1,")"]}]}],["$","li","2024年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-10","className":"hover:text-blue-500 transition-colors block","children":["2024年10月"," (",1,")"]}]}],["$","li","2024年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-07","className":"hover:text-blue-500 transition-colors block","children":["2024年7月"," (",1,")"]}]}],["$","li","2024年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-02","className":"hover:text-blue-500 transition-colors block","children":["2024年2月"," (",1,")"]}]}],["$","li","2023年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-06","className":"hover:text-blue-500 transition-colors block","children":["2023年6月"," (",1,")"]}]}],["$","li","2023年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-05","className":"hover:text-blue-500 transition-colors block","children":["2023年5月"," (",1,")"]}]}],["$","li","2023年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-03","className":"hover:text-blue-500 transition-colors block","children":["2023年3月"," (",2,")"]}]}],["$","li","2022年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-10","className":"hover:text-blue-500 transition-colors block","children":["2022年10月"," (",2,")"]}]}],["$","li","2022年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-09","className":"hover:text-blue-500 transition-colors block","children":["2022年9月"," (",2,")"]}]}],["$","li","2022年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-07","className":"hover:text-blue-500 transition-colors block","children":["2022年7月"," (",1,")"]}]}],["$","li","2022年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-03","className":"hover:text-blue-500 transition-colors block","children":["2022年3月"," (",1,")"]}]}],["$","li","2021年11月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-11","className":"hover:text-blue-500 transition-colors block","children":["2021年11月"," (",1,")"]}]}],["$","li","2021年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-10","className":"hover:text-blue-500 transition-colors block","children":["2021年10月"," (",2,")"]}]}],["$","li","2021年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-07","className":"hover:text-blue-500 transition-colors block","children":["2021年7月"," (",2,")"]}]}],["$","li","2021年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-05","className":"hover:text-blue-500 transition-colors block","children":["2021年5月"," (",2,")"]}]}],["$","li","2021年4月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-04","className":"hover:text-blue-500 transition-colors block","children":["2021年4月"," (",2,")"]}]}],["$","li","2021年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-02","className":"hover:text-blue-500 transition-colors block","children":["2021年2月"," (",1,")"]}]}],["$","li","2020年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-10","className":"hover:text-blue-500 transition-colors block","children":["2020年10月"," (",1,")"]}]}],["$","li","2020年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-09","className":"hover:text-blue-500 transition-colors block","children":["2020年9月"," (",1,")"]}]}],["$","li","2020年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-07","className":"hover:text-blue-500 transition-colors block","children":["2020年7月"," (",1,")"]}]}],["$","li","2020年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-06","className":"hover:text-blue-500 transition-colors block","children":["2020年6月"," (",1,")"]}]}],["$","li","2020年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-05","className":"hover:text-blue-500 transition-colors block","children":["2020年5月"," (",1,")"]}]}],["$","li","2020年4月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-04","className":"hover:text-blue-500 transition-colors block","children":["2020年4月"," (",1,")"]}]}],["$","li","2020年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-03","className":"hover:text-blue-500 transition-colors block","children":["2020年3月"," (",1,")"]}]}],["$","li","2020年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-02","className":"hover:text-blue-500 transition-colors block","children":["2020年2月"," (",1,")"]}]}],["$","li","2020年1月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-01","className":"hover:text-blue-500 transition-colors block","children":["2020年1月"," (",1,")"]}]}],["$","li","2019年12月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-12","className":"hover:text-blue-500 transition-colors block","children":["2019年12月"," (",1,")"]}]}],["$","li","2019年11月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-11","className":"hover:text-blue-500 transition-colors block","children":["2019年11月"," (",2,")"]}]}],["$","li","2019年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-10","className":"hover:text-blue-500 transition-colors block","children":["2019年10月"," (",2,")"]}]}],["$","li","2019年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-09","className":"hover:text-blue-500 transition-colors block","children":["2019年9月"," (",1,")"]}]}],["$","li","2019年8月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-08","className":"hover:text-blue-500 transition-colors block","children":["2019年8月"," (",8,")"]}]}],["$","li","2019年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-07","className":"hover:text-blue-500 transition-colors block","children":["2019年7月"," (",1,")"]}]}],["$","li","2019年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-06","className":"hover:text-blue-500 transition-colors block","children":["2019年6月"," (",2,")"]}]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"タグ一覧"}],["$","div",null,{"className":"flex flex-wrap gap-2","children":[["$","$L6","データ分析",{"href":"/tags/データ分析","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["データ分析"," (",18,")"]}],["$","$L6","自然言語処理",{"href":"/tags/自然言語処理","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["自然言語処理"," (",16,")"]}],["$","$L6","SAS",{"href":"/tags/SAS","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["SAS"," (",12,")"]}],["$","$L6","Kaggle",{"href":"/tags/Kaggle","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["Kaggle"," (",3,")"]}],["$","$L6","雑談",{"href":"/tags/雑談","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["雑談"," (",2,")"]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"広告枠"}],["$","div",null,{"className":"flex justify-center items-center","children":[["$","a",null,{"href":"https://px.a8.net/svt/ejp?a8mat=4AXI0F+CULTTE+348+6CWQP","rel":"nofollow","target":"_blank","children":["$","img",null,{"style":{"border":0},"width":"250","height":"250","alt":"","src":"https://www25.a8.net/svt/bgt?aid=260223855777&wid=001&eno=01&mid=s00000000404001068000&mc=1"}]}],["$","img",null,{"style":{"border":0},"width":"1","height":"1","src":"https://www17.a8.net/0.gif?a8mat=4AXI0F+CULTTE+348+6CWQP","alt":""}]]}]]}]]}]