1:HL["/_next/static/media/e4af272ccee01ff0-s.p.woff2","font",{"crossOrigin":"","type":"font/woff2"}] 2:HL["/_next/static/css/97132489b96da1d5.css","style",{"crossOrigin":""}] 0:["Y_TW_5cOL4VPb7FuqRz3I",[[["",{"children":[["slug","blog19","d"],{"children":["__PAGE__?{\"slug\":\"blog19\"}",{}]}]},"$undefined","$undefined",true],"$L3",[[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/97132489b96da1d5.css","precedence":"next","crossOrigin":""}]],"$L4"]]]] 5:HL["/_next/static/css/5b2728e81018a7be.css","style",{"crossOrigin":""}] 6:I[8326,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] 7:I[6954,[],""] 8:I[7264,[],""] 3:[null,["$","html",null,{"lang":"ja","children":["$","body",null,{"className":"__className_f367f3 flex flex-col min-h-screen","children":[["$","header",null,{"className":"blog-header py-5","children":["$","div",null,{"className":"container mx-auto px-4","children":["$","div",null,{"className":"flex flex-col items-start","children":[["$","$L6",null,{"href":"/","className":"hover:no-underline","children":["$","h1",null,{"className":"text-6xl font-serif text-gray-800 mb-2 font-normal","children":"Shingoの数学ノート"}]}],["$","p",null,{"className":"text-xl text-gray-400 font-normal","children":"プログラミングと機械学習のメモ"}]]}]}]}],["$","main",null,{"className":"flex-grow","children":["$","$L7",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","template":["$","$L8",null,{}],"templateStyles":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"childProp":{"current":["$","$L7",null,{"parallelRouterKey":"children","segmentPath":["children",["slug","blog19","d"],"children"],"loading":"$undefined","loadingStyles":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","template":["$","$L8",null,{}],"templateStyles":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","childProp":{"current":["$L9","$La",null],"segment":"__PAGE__?{\"slug\":\"blog19\"}"},"styles":[["$","link","0",{"rel":"stylesheet","href":"/_next/static/css/5b2728e81018a7be.css","precedence":"next","crossOrigin":""}]]}],"segment":["slug","blog19","d"]},"styles":[]}]}],["$","footer",null,{"className":"bg-[#DDDDDD] text-[#999999] py-8 mt-12 text-center border-t border-[#e5e5e5]","children":["$","div",null,{"className":"container mx-auto px-4","children":[["$","p",null,{"className":"mb-2","children":"© All rights reserved by Shingo Sekine."}],["$","p",null,{"children":["$","a",null,{"href":"#","className":"hover:text-blue-500 transition-colors","children":"Back to top"}]}]]}]}]]}]}],null] 4:[["$","meta","0",{"charSet":"utf-8"}],["$","title","1",{"children":"Shingoの数学ノート"}],["$","meta","2",{"name":"description","content":"プログラミング言語と機械学習のメモ"}],["$","meta","3",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","link","4",{"rel":"icon","href":"/favicon.ico","type":"image/x-icon","sizes":"256x256"}],["$","meta","5",{"name":"next-size-adjust"}]] b:I[6180,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] d:I[9703,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] c:Td814,/*@jsxRuntime automatic @jsxImportSource react*/ const {Fragment: _Fragment, jsx: _jsx, jsxs: _jsxs} = arguments[0]; const {useMDXComponents: _provideComponents} = arguments[0]; function _createMdxContent(props) { const _components = Object.assign({ p: "p", a: "a", h3: "h3", pre: "pre", code: "code", span: "span" }, _provideComponents(), props.components); return _jsxs(_Fragment, { children: [_jsxs(_components.p, { children: [_jsx(_components.a, { href: "./blog18.html", children: "前回の続き" }), "です。wikipediaをコーパスにして、Skip-Gram with negative samplingを実装しようと思います。"] }), "\n", _jsx(_components.h3, { children: "wikipediaからコーパスを作成しよう" }), "\n", _jsx(_components.p, { children: "まずは、wikipediaからデータを取得します。全部のデータで実行するととても時間がかかるため、一部だけ使用します。以下のデータをダウンロードしてください。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "https://dumps.wikimedia.org/jawiki/20191201/jawiki-20191201-pages-articles1.xml-p1p106175.bz2", children: "https://dumps.wikimedia.org/jawiki/20191201/jawiki-20191201-pages-articles1.xml-p1p106175.bz2" }) }), "\n", _jsx(_components.p, { children: "その後、WikiExtractor.pyを使用して使いやすい形にします。以下のサイトからWikiExtractor.pyをダウンロードしましょう。" }), "\n", _jsx(_components.p, { children: _jsx(_components.a, { href: "http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py", children: "http://medialab.di.unipi.it/Project/SemaWiki/Tools/WikiExtractor.py" }) }), "\n", _jsx(_components.p, { children: "次に、そのディレクトリで以下のコマンドを打ちます。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["python WikiExtractor.py -cb 250K -o extracted jawiki-", _jsx(_components.span, { className: "hljs-number", children: "20191201" }), "-pages-articles1.xml-p1p106175.bz2\n"] }) }), "\n", _jsx(_components.p, { children: "extractedの下に「AA」などのフォルダが出来ていれば成功です。" }), "\n", _jsx(_components.p, { children: "中身は次のようになっているため、タグを取り除く等のクレンジングを行う必要があります。" }), "\n", _jsx(_components.pre, { children: _jsx(_components.code, { children: "\\nUSアレッツォ\\n\\nSSアレッツォ(Società Sportiva Arezzo S.r.l.)はイタリアのトスカーナ州アレッツォを本拠地とするサッカークラブチームである(旧称USアレッツォ(\"Unione Sportiva Arezzo\")。2018-19シーズンはセリエCに所属している。\\n1923年、名門ユヴェントスにあやかり、\"ユヴェントスFCアレッツォ\"(\"Juventus Football Club Arezzo\")として創設。1930年にUSアレッツォ(\"Unione Sportiva Arezzo\")と名前を変えた。\n" }) }), "\n", _jsx(_components.p, { children: "ファイルを全て読み込み、クレンジングをして1つのDataFrameにします。(カレントディレクトリにdataフォルダを作成してください。)" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " glob\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " bz2\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " re\n", _jsx(_components.span, { className: "hljs-keyword", children: "from" }), " tqdm ", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " tqdm_notebook\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " pandas ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " pd \n\npaths = glob.glob(", _jsx(_components.span, { className: "hljs-string", children: "\"./extracted/*/*\"" }), ") \n\n", _jsx(_components.span, { className: "hljs-comment", children: "# ()の中身を削除する" }), "\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "sakujo" }), "(", _jsxs(_components.span, { className: "hljs-params", children: ["strs, start=", _jsx(_components.span, { className: "hljs-string", children: "\"(\"" }), ", end=", _jsx(_components.span, { className: "hljs-string", children: "\")\"" })] }), "): \n cnt = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), " \n text = ", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), "\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " s ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " strs:\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " s == start: cnt += ", _jsx(_components.span, { className: "hljs-number", children: "1" }), "\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " cnt == ", _jsx(_components.span, { className: "hljs-number", children: "0" }), ": text += s\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " s == end ", _jsx(_components.span, { className: "hljs-keyword", children: "and" }), " cnt >= ", _jsx(_components.span, { className: "hljs-number", children: "1" }), ": cnt -= ", _jsx(_components.span, { className: "hljs-number", children: "1" }), "\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " text \n \n", _jsx(_components.span, { className: "hljs-comment", children: "# 実際の処理。タグや()を外してクレンジング。 " }), "\nlists = []\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " path ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " tqdm_notebook(paths): \n ", _jsx(_components.span, { className: "hljs-keyword", children: "with" }), " bz2.", _jsx(_components.span, { className: "hljs-built_in", children: "open" }), "(path) ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " f: \n text = f.read().decode(", _jsx(_components.span, { className: "hljs-string", children: "\"utf8\"" }), ") \n text = re.sub(", _jsx(_components.span, { className: "hljs-string", children: "\"<.*?>\\n.*?\\n\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), ",text) \n text = re.sub(", _jsx(_components.span, { className: "hljs-string", children: "\"<.*?>\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), ",text) \n text = re.sub(", _jsx(_components.span, { className: "hljs-string", children: "\"\\n\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), ",text) \n text = re.sub(", _jsx(_components.span, { className: "hljs-string", children: "\"\\(.*?\\)\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), ",text) \n text = sakujo(text) \n lists.append([path,text]) \n \nwiki_df = pd.DataFrame(lists,columns=[", _jsx(_components.span, { className: "hljs-string", children: "\"path\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"text\"" }), "]) \nwiki_df.to_pickle(", _jsx(_components.span, { className: "hljs-string", children: "\"./data/wiki_df.pkl\"" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: "これにより、wikipediaのクレンジングが完了しました。" }), "\n", _jsx(_components.h3, { children: "分かち書きとデータ分割" }), "\n", _jsxs(_components.p, { children: ["次に、分かち書きを行います。今回はMeCab(Neologd)を使用します。まだインストールしてない方は", _jsx(_components.a, { href: "/blog12.html", children: "こちら" }), "を参照してください。"] }), "\n", _jsx(_components.p, { children: "まず、学習対象の品詞を決定します。割と悩みましたが、以下の品詞を抽出します。また、stop wordも設定します。(頻出のwordをstop wordにしています。)" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["keep_hinshi=[ [", _jsx(_components.span, { className: "hljs-string", children: "\"形容詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"自立\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"動詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"自立\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"副詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"一般\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"サ変接続\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"ナイ形容詞語幹\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"一般\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"形容動詞語幹\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"固有名詞\"" }), "], [", _jsx(_components.span, { className: "hljs-string", children: "\"名詞\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"副詞可能\"" }), "] ] sw=[", _jsx(_components.span, { className: "hljs-string", children: "\"*\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"\\n\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"する\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"なる\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"ある\"" }), ",", _jsx(_components.span, { className: "hljs-string", children: "\"いう\"" }), "]\n"] }) }), "\n", _jsx(_components.p, { children: "また、形態素解析も行います。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " MeCab\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "ja_parse" }), "(", _jsxs(_components.span, { className: "hljs-params", children: ["tweet, keep_hinshi, stop_word=[", _jsx(_components.span, { className: "hljs-string", children: "\"\"" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"*\"" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"\\n\"" }), "]"] }), "): \n t = MeCab.Tagger()\n temp1 = t.parse(tweet)\n temp2 = temp1.split(", _jsx(_components.span, { className: "hljs-string", children: "\"\\n\"" }), ")[:-", _jsx(_components.span, { className: "hljs-number", children: "2" }), "]\n \n t_list = [[i.split(", _jsx(_components.span, { className: "hljs-string", children: "\"\\t\"" }), ")[", _jsx(_components.span, { className: "hljs-number", children: "0" }), "]] + i.split(", _jsx(_components.span, { className: "hljs-string", children: "\"\\t\"" }), ")[", _jsx(_components.span, { className: "hljs-number", children: "1" }), "].split(", _jsx(_components.span, { className: "hljs-string", children: "\",\"" }), ") ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " temp2] \n output = [w[", _jsx(_components.span, { className: "hljs-number", children: "7" }), "] ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " w ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " t_list ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " w[", _jsx(_components.span, { className: "hljs-number", children: "7" }), "] ", _jsx(_components.span, { className: "hljs-keyword", children: "not" }), " ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " stop_word ", _jsx(_components.span, { className: "hljs-keyword", children: "and" }), " w[", _jsx(_components.span, { className: "hljs-number", children: "1" }), ":", _jsx(_components.span, { className: "hljs-number", children: "3" }), "] ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " keep_hinshi]\n \n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " output\n"] }) }), "\n", _jsx(_components.p, { children: "実際にwikipediaの形態素解析を実行します。形態素解析では早い部類のMeCabでも時間がかかるので、並列処理を行なっています。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "from" }), " joblib ", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " Parallel, delayed\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " numpy ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " np\n\nlists = Parallel(n_jobs=", _jsx(_components.span, { className: "hljs-number", children: "4" }), ", verbose=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")([delayed(", _jsx(_components.span, { className: "hljs-keyword", children: "lambda" }), " x: ja_parse(x, keep_hinshi, stop_word=sw))(w) ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " w ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " wiki_df[", _jsx(_components.span, { className: "hljs-string", children: "\"text\"" }), "]])\n"] }) }), "\n", _jsx(_components.p, { children: "ここから、使用する単語を絞り、単語のidを割り振り、分布を算出します。今回は、30より多く出現している単語に絞りました。全部で167621個あります。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["minthr = ", _jsx(_components.span, { className: "hljs-number", children: "30" }), " \n", _jsx(_components.span, { className: "hljs-comment", children: "# listsを1次元にする。 " }), "\nar = []\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i,ws ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "enumerate" }), "(lists):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " w ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ws: \n ar.append(w) \n\n", _jsx(_components.span, { className: "hljs-comment", children: "# min_thrより大きい単語のみを残す。 " }), "\ns = pd.Series(ar).value_counts() \nr = s[s>minthr].rank(ascending=", _jsx(_components.span, { className: "hljs-literal", children: "False" }), ", method=", _jsx(_components.span, { className: "hljs-string", children: "\"first\"" }), ").astype(", _jsx(_components.span, { className: "hljs-built_in", children: "int" }), ")-", _jsx(_components.span, { className: "hljs-number", children: "1" }), " \nword2id = r.to_dict() \nid2word = {v:k ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " k,v ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " word2id.items()} \n\nword_hist = pd.Series(ar).value_counts().loc[np.array([id2word[i] ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(id2word))])].values\n"] }) }), "\n", _jsx(_components.p, { children: "作成したidや分布を保存します。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " pickle \n", _jsx(_components.span, { className: "hljs-keyword", children: "with" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "open" }), "(", _jsx(_components.span, { className: "hljs-string", children: "\"./data/w2vconf.pkl\"" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"wb\"" }), ") ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " f: \n pickle.dump({ ", _jsx(_components.span, { className: "hljs-string", children: "\"word2id\"" }), ":word2id, ", _jsx(_components.span, { className: "hljs-string", children: "\"id2word\"" }), ":id2word, ", _jsx(_components.span, { className: "hljs-string", children: "\"word_hist\"" }), ":word_hist }, f)\n"] }) }), "\n", _jsx(_components.p, { children: "また、私のPCのメモリは16GBですが、全然足りなかったため、バッチごとにファイルを読みに行くことにしました。と言うわけで、バッチごとにファイルを分割します。" }), "\n", _jsx(_components.p, { children: "今回はwindow=4にして、前後4つずつ単語を取得し、それをyとしています。また、バッチサイズは1000にしました。先にdataフォルダの直下にwakatiフォルダを作成してください。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["window = ", _jsx(_components.span, { className: "hljs-number", children: "4" }), " \nsize = ", _jsx(_components.span, { className: "hljs-number", children: "1000" }), " \ncnt = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), " \nX = []\ny = []\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " idx ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " tqdm_notebook(", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(window, ", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(train_w)-window)): \n X.append(train_w[idx])\n y.append(np.concatenate([train_w[idx-window:idx], train_w[idx+", _jsx(_components.span, { className: "hljs-number", children: "1" }), ":idx+window+", _jsx(_components.span, { className: "hljs-number", children: "1" }), "]]))\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "if" }), " (idx+", _jsx(_components.span, { className: "hljs-number", children: "1" }), ") % ", _jsx(_components.span, { className: "hljs-number", children: "1000" }), " == ", _jsx(_components.span, { className: "hljs-number", children: "0" }), ":\n np.savez_compressed(", _jsx(_components.span, { className: "hljs-string", children: "\"./data/wakati/wiki_wakati_{}.npz\"" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "format" }), "(cnt), X=np.array(X), y=np.array(y)) \n X = []\n y = []\n cnt += ", _jsx(_components.span, { className: "hljs-number", children: "1" }), "\n"] }) }), "\n", _jsx(_components.p, { children: "ここで前処理が終わります。このブログではあまり変数の中身を確認しませんので適宜変数を確認していってくださいね。" }), "\n", _jsx(_components.h3, { children: "SGNSを実装しよう" }), "\n", _jsx(_components.p, { children: "ようやく本題です。pytorchの流れとして、dataset→dataloader→modelの作成があります。まずはdatasetを作成しましょう。" }), "\n", _jsx(_components.p, { children: "とはいっても、前準備で正例データは作成しましたので、ただ読み込むだけです。負例データはnp.random.choiceを使用します。(本来は正例データ以外の単語を使用するべきですが、時間がかかるため全体の単語の重み付きランダムサンプリングになっています。)" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "import" }), " torch\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " torch.nn ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " nn\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " torch.nn.functional ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " F\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " torch.optim ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " optim\n", _jsx(_components.span, { className: "hljs-keyword", children: "from" }), " torch.utils.data ", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " DataLoader, TensorDataset\n", _jsx(_components.span, { className: "hljs-keyword", children: "from" }), " torch.autograd ", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " Variable\n", _jsx(_components.span, { className: "hljs-keyword", children: "import" }), " numpy ", _jsx(_components.span, { className: "hljs-keyword", children: "as" }), " np\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "class" }), " ", _jsx(_components.span, { className: "hljs-title class_", children: "w2vdataset" }), "(torch.utils.data.Dataset):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "__init__" }), "(", _jsxs(_components.span, { className: "hljs-params", children: ["self, paths, word_hist, ", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), "=", _jsx(_components.span, { className: "hljs-number", children: "3" }), "/", _jsx(_components.span, { className: "hljs-number", children: "4" }), ", sample=", _jsx(_components.span, { className: "hljs-number", children: "5" })] }), "): \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".paths = paths \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), " = ", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(paths) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_hist = word_hist \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_len = ", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(word_hist) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_weight = ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_hist**", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), " / (", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_hist**", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), ").", _jsx(_components.span, { className: "hljs-built_in", children: "sum" }), "() \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".sample = sample\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "__len__" }), "(", _jsx(_components.span, { className: "hljs-params", children: "self" }), "):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "negative_sampling" }), "(", _jsx(_components.span, { className: "hljs-params", children: "self, outputs" }), "): \n ar = np.arange(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_hist)) \n c = np.random.choice(ar, p=", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".word_weight, size=", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".sample*outputs.shape[", _jsx(_components.span, { className: "hljs-number", children: "0" }), "]*outputs.shape[", _jsx(_components.span, { className: "hljs-number", children: "1" }), "]).reshape(outputs.shape[", _jsx(_components.span, { className: "hljs-number", children: "0" }), "],outputs.shape[", _jsx(_components.span, { className: "hljs-number", children: "1" }), "]*", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".sample)\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " c\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "__getitem__" }), "(", _jsx(_components.span, { className: "hljs-params", children: "self, idx" }), "): \n l = np.load(", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".paths[idx]) \n X, y = l[", _jsx(_components.span, { className: "hljs-string", children: "\"X\"" }), "], l[", _jsx(_components.span, { className: "hljs-string", children: "\"y\"" }), "] \n nega_words = ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".negative_sampling(y)\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " X, y, nega_words\n"] }) }), "\n", _jsx(_components.p, { children: "datasetとdataloaderをインスタンス化します。batch_size=1となっていますが、実際のサイズは1000です。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["wakati_paths = glob.glob(", _jsx(_components.span, { className: "hljs-string", children: "\"./data/wakati/wiki_wakati_*.npz\"" }), ") \nbatch_size = ", _jsx(_components.span, { className: "hljs-number", children: "1" }), " \n", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), " = ", _jsx(_components.span, { className: "hljs-number", children: "3" }), "/", _jsx(_components.span, { className: "hljs-number", children: "4" }), " \nsample = ", _jsx(_components.span, { className: "hljs-number", children: "5" }), " \nds = w2vdataset(wakati_paths, word_hist, ", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), "=", _jsx(_components.span, { className: "hljs-built_in", children: "pow" }), ", sample=sample) \ndl = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=", _jsx(_components.span, { className: "hljs-literal", children: "True" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: "肝心のSGNSです。といってもすごくシンプルで、前回の図の通りに組んでいきます。inputsとpoji,negaのembedding結果をそれぞれ掛け合わせて、sigmoidを噛ませるだけです。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["dim = ", _jsx(_components.span, { className: "hljs-number", children: "200" }), " \ntorch.manual_seed(", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "class" }), " ", _jsx(_components.span, { className: "hljs-title class_", children: "word2vec_SGNS" }), "(nn.Module):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "__init__" }), "(", _jsx(_components.span, { className: "hljs-params", children: "self,dim" }), "): \n ", _jsx(_components.span, { className: "hljs-built_in", children: "super" }), "(word2vec_SGNS, ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ").__init__() \n ", _jsx(_components.span, { className: "hljs-comment", children: "# embedding layerの設定 " }), "\n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_i = nn.Embedding(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(word2id), dim) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_o = nn.Embedding(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(word2id), dim) \n ", _jsx(_components.span, { className: "hljs-comment", children: "# 初期化 " }), "\n initrange = ", _jsx(_components.span, { className: "hljs-number", children: "1" }), "/dim \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_i.weight.data.uniform_(-initrange, initrange) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_o.weight.data.uniform_(-initrange, initrange)\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "forward" }), "(", _jsx(_components.span, { className: "hljs-params", children: "self,inputs,outputs,nega_words" }), "): \n ", _jsx(_components.span, { className: "hljs-comment", children: "# inputsの単語のembedding " }), "\n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".inputs = ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_i(inputs) \n ", _jsx(_components.span, { className: "hljs-comment", children: "# 正例ラベルのスコアを算出 " }), "\n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".poji_outputs = ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_o(outputs) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".poji_score = torch.bmm(", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".poji_outputs, ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".inputs.unsqueeze(", _jsx(_components.span, { className: "hljs-number", children: "2" }), ")).view(-", _jsx(_components.span, { className: "hljs-number", children: "1" }), ") \n ", _jsx(_components.span, { className: "hljs-comment", children: "# 負例ラベルのスコアを算出 " }), "\n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".nega_outputs = ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".embeds_o(nega_words) \n ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".nega_score = torch.bmm(", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".nega_outputs, ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".inputs.unsqueeze(", _jsx(_components.span, { className: "hljs-number", children: "2" }), ")).view(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".poji_score),-", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " ", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".poji_score.sigmoid(),", _jsx(_components.span, { className: "hljs-variable language_", children: "self" }), ".nega_score.sigmoid()\n"] }) }), "\n", _jsx(_components.p, { children: "***追記 20201119***" }), "\n", _jsx(_components.p, { children: "次のコードでモデルを定義してください。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["model = word2vec_SGNS(dim=", _jsx(_components.span, { className: "hljs-number", children: "200" }), ").cuda() \noptimizer = optim.SGD(model.parameters(), lr=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\n"] }) }), "\n", _jsx(_components.p, { children: "いよいよ学習です。**先にmodelフォルダを作成してください。**なお、RTX2080Tiで1epochあたり22分程度かかります。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: [_jsx(_components.span, { className: "hljs-keyword", children: "for" }), " epoch ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(", _jsx(_components.span, { className: "hljs-number", children: "20" }), "): ", _jsx(_components.span, { className: "hljs-comment", children: "#学習回数20回 " }), "\n total_loss = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), " \n poji_score_mean = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), " \n nega_score_mean = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), " \n data_cnt = ", _jsx(_components.span, { className: "hljs-number", children: "0" }), "\n\n ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " x, y, nega_words ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " tqdm_notebook(dl): \n ", _jsx(_components.span, { className: "hljs-comment", children: "# batch_size=1の次元が余計なので、squeezeで元に戻す。 " }), "\n x = Variable(x).cuda().squeeze(", _jsx(_components.span, { className: "hljs-number", children: "0" }), ")\n y = Variable(y).cuda().squeeze(", _jsx(_components.span, { className: "hljs-number", children: "0" }), ") \n nega_words = Variable(nega_words).cuda().squeeze(", _jsx(_components.span, { className: "hljs-number", children: "0" }), ") \n \n optimizer.zero_grad() \n poji_score, nega_score = model(x, y, nega_words) \n \n ", _jsx(_components.span, { className: "hljs-comment", children: "# 実際は負例は.sum(dim=1).mean()だが、学習がnegaに偏りすぎたので単にmean()としています。 " }), "\n loss = -torch.log(poji_score+", _jsx(_components.span, { className: "hljs-number", children: "1e-10" }), ").mean() - (torch.log(", _jsx(_components.span, { className: "hljs-number", children: "1" }), " - nega_score+", _jsx(_components.span, { className: "hljs-number", children: "1e-10" }), ")).mean() \n loss.backward() \n optimizer.step() \n \n total_loss += loss.item() \n poji_score_mean += poji_score.mean() \n nega_score_mean += nega_score.mean() \n data_cnt += ", _jsx(_components.span, { className: "hljs-number", children: "1" }), "\n\n ", _jsx(_components.span, { className: "hljs-built_in", children: "print" }), "(epoch+", _jsx(_components.span, { className: "hljs-number", children: "1" }), ", ", _jsx(_components.span, { className: "hljs-string", children: "\"train_loss:\"" }), ", total_loss/data_cnt, ", _jsx(_components.span, { className: "hljs-string", children: "\"poji:\"" }), ", poji_score_mean.item()/data_cnt, ", _jsx(_components.span, { className: "hljs-string", children: "\"nega:\"" }), ", nega_score_mean.item()/data_cnt) \n torch.save(model.state_dict(), ", _jsx(_components.span, { className: "hljs-string", children: "\"./model/model{}.h5\"" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "format" }), "(epoch+", _jsx(_components.span, { className: "hljs-number", children: "1" }), "))\n"] }) }), "\n", _jsx(_components.h3, { children: "SGNSの結果を見よう" }), "\n", _jsx(_components.p, { children: "ある程度学習できたら、実際に結果を見てみましょう。もし途中のepochの学習結果を見たいのであれば、以下のコードを実行しましょう。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["epoch=", _jsx(_components.span, { className: "hljs-number", children: "20" }), " \nmodel.load_state_dict(torch.load(", _jsx(_components.span, { className: "hljs-string", children: "\"./model/model{}.h5\"" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "format" }), "(epoch)))\n"] }) }), "\n", _jsx(_components.p, { children: "次は似ている単語を見るための関数の作成です。(20201125一部追記)" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["p = model.embeds_i.weight \nc = [", _jsx(_components.span, { className: "hljs-string", children: "\"f{:03d}\"" }), ".", _jsx(_components.span, { className: "hljs-built_in", children: "format" }), "(i) ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(", _jsx(_components.span, { className: "hljs-number", children: "200" }), ")] \nw = [id2word[i] ", _jsx(_components.span, { className: "hljs-keyword", children: "for" }), " i ", _jsx(_components.span, { className: "hljs-keyword", children: "in" }), " ", _jsx(_components.span, { className: "hljs-built_in", children: "range" }), "(", _jsx(_components.span, { className: "hljs-built_in", children: "len" }), "(word2id))]\n\nword_df = pd.DataFrame(p.cpu().detach().numpy(), columns=c, index=w)\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "cos_sim" }), "(", _jsx(_components.span, { className: "hljs-params", children: "v1, v2" }), "):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "w2v" }), "(", _jsx(_components.span, { className: "hljs-params", children: "w" }), "):\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " word_df.loc[w].values\n\n", _jsx(_components.span, { className: "hljs-keyword", children: "def" }), " ", _jsx(_components.span, { className: "hljs-title function_", children: "word_sim" }), "(", _jsxs(_components.span, { className: "hljs-params", children: ["v1, top=", _jsx(_components.span, { className: "hljs-number", children: "20" })] }), "): \n sim = word_df.apply(", _jsx(_components.span, { className: "hljs-keyword", children: "lambda" }), " v2: cos_sim(v1, v2), axis=", _jsx(_components.span, { className: "hljs-number", children: "1" }), ")\n ", _jsx(_components.span, { className: "hljs-keyword", children: "return" }), " sim.sort_values(ascending=", _jsx(_components.span, { className: "hljs-literal", children: "False" }), ")[:top]\n"] }) }), "\n", _jsx(_components.p, { children: "まずは、ラーメンの類似度から。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["v1 = w2v(", _jsx(_components.span, { className: "hljs-string", children: "\"ラーメン\"" }), ")\nword_sim(v1)\n"] }) }), "\n", _jsx(_components.p, { children: "結果" }), "\n", _jsx(_components.pre, { children: _jsx(_components.code, { children: "ラーメン 1.000000\n寿司 0.962305\n餃子 0.943175\nうどん 0.942448\n弁当 0.923823\n焼肉 0.912093\n丼 0.909097\n和食 0.907628\nたこ焼き 0.900868\n和菓子 0.899265\n麺 0.898703\nお好み焼き 0.894609\n土産 0.894093\n手作り 0.892730\nおでん 0.887496\n名物 0.887184\nハンバーガー 0.885918\n駄菓子 0.885593\nお菓子 0.885479\n焼きそば 0.883539\ndtype: float64\n" }) }), "\n", _jsx(_components.p, { children: "一番近いのが寿司?食べ物という観点では一緒だが、似ているかどうかと言われると怪しい。でも、餃子とか麺とかは似ているといえば似ている。 もう一つ試してみよう。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["v2 = w2v(", _jsx(_components.span, { className: "hljs-string", children: "\"楽しい\"" }), ")\nword_sim(v2)\n"] }) }), "\n", _jsx(_components.p, { children: "結果" }), "\n", _jsx(_components.pre, { children: _jsx(_components.code, { children: "楽しい 1.000000\n楽しみ 0.969969\n面白い 0.949566\n感動 0.948224\n友達 0.947869\n遊び 0.943903\n笑顔 0.939842\n観る 0.936940\n大好き 0.936386\n女の子 0.935551\n元気 0.933673\n楽しさ 0.933347\n笑い 0.933258\nすごい 0.927031\nいつも 0.923831\n思い出 0.923417\n幸せ 0.922852\n喋る 0.921187\n想い 0.920456\n優しい 0.917699\ndtype: float64\n" }) }), "\n", _jsx(_components.p, { children: "なんとなく楽しいと近い単語が並んでいるのではないか。最後に恒例の単語の足し算引き算をしてみる。" }), "\n", _jsx(_components.pre, { children: _jsxs(_components.code, { className: "hljs language-python", children: ["v3 = w2v(", _jsx(_components.span, { className: "hljs-string", children: "\"サッカー\"" }), ") \nv4 = w2v(", _jsx(_components.span, { className: "hljs-string", children: "\"蹴る\"" }), ") \nv5 = w2v(", _jsx(_components.span, { className: "hljs-string", children: "\"投げる\"" }), ")\n\nword_sim(v3-v4+v5)\n"] }) }), "\n", _jsx(_components.p, { children: "結果" }), "\n", _jsx(_components.pre, { children: _jsx(_components.code, { children: "サッカー 0.952638\nラグビー 0.928705\nバスケットボール 0.917544\n野球 0.915660\nバレーボール 0.901993\nアメリカンフットボール 0.883758\nアイスホッケー 0.883444\n陸上競技 0.870330\nフットボール 0.868901\n卓球 0.866964\nユース 0.864876\n現役時代 0.864565\nソフトボール 0.860536\nハンドボール 0.859906\nテニス 0.858122\nフットサル 0.856992\n自転車競技 0.856940\nJリーグ 0.851903\nクリケット 0.848675\n女子サッカー 0.844556\ndtype: float64\n" }) }), "\n", _jsx(_components.p, { children: "サッカーが1番上に来てしまっているが、ラグビー、バスケ、野球など、ボールを投げるスポーツが上位に来ている。" }), "\n", _jsx(_components.p, { children: "SGNSの実装はこれで終了です。windowの数やoptimizer等をしっかりいじれば、もっと精度は高くなっていくと思いますので、ぜひ試してみてください!" })] }); } function MDXContent(props = {}) { const {wrapper: MDXLayout} = Object.assign({}, _provideComponents(), props.components); return MDXLayout ? _jsx(MDXLayout, Object.assign({}, props, { children: _jsx(_createMdxContent, props) })) : _createMdxContent(props); } return { default: MDXContent }; a:["$","div",null,{"className":"container mx-auto px-4 py-8 max-w-7xl","children":["$","div",null,{"className":"flex flex-col md:flex-row gap-8","children":[["$","article",null,{"className":"w-full md:w-[70%] bg-white shadow-lg rounded-lg p-8","children":[["$","header",null,{"className":"mb-8 border-b pb-4","children":[["$","div",null,{"className":"flex items-center gap-2 mb-2","children":[["$","$L6",null,{"href":"/","className":"text-gray-500 hover:text-orange-500","children":"Home"}],["$","span",null,{"className":"text-gray-300","children":">"}],["$","span",null,{"className":"text-gray-500","children":"$undefined"}]]}],["$","h1",null,{"className":"text-3xl font-bold mb-4","children":"word2vec(Skip-Gram with Negative Sampling)の理論と実装2"}],["$","div",null,{"className":"text-gray-500 flex flex-wrap gap-4 items-center mb-6","children":[["$","time",null,{"className":"flex items-center gap-1 text-sm","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","height":"18px","viewBox":"0 0 24 24","width":"18px","fill":"#666666","className":"mr-1","children":[["$","path",null,{"d":"M0 0h24v24H0V0z","fill":"none"}],["$","path",null,{"d":"M19 3h-1V1h-2v2H8V1H6v2H5c-1.11 0-2 .9-2 2v14c0 1.1.89 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm0 16H5V8h14v11zM7 10h5v5H7z"}]]}],"日付: ","2020-01-12"]}],["$","div",null,{"className":"flex flex-wrap gap-2","children":[["$","$L6","自然言語処理",{"href":"/tags/自然言語処理","className":"bg-gray-100 hover:bg-blue-100 px-2 py-1 rounded text-sm text-gray-600 hover:text-blue-600 transition-colors","children":["#","自然言語処理"]}]]}]]}]]}],["$","div",null,{"className":"prose max-w-none prose-headings:border-b prose-headings:pb-2 prose-a:text-orange-600","children":["$","$Lb",null,{"compiledSource":"$c","frontmatter":{},"scope":{}}]}],["$","div",null,{"className":"flex justify-between items-center mt-12","children":[["$","div",null,{"className":"flex-1","children":["$","$L6",null,{"href":"/blog18","className":"group flex flex-col items-start p-4 rounded-lg border border-gray-100 hover:border-orange-200 hover:bg-orange-50 transition-all","children":[["$","span",null,{"className":"text-sm text-gray-500 mb-1 group-hover:text-orange-600 flex items-center","children":[["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","className":"h-4 w-4 mr-1","fill":"none","viewBox":"0 0 24 24","stroke":"currentColor","children":["$","path",null,{"strokeLinecap":"round","strokeLinejoin":"round","strokeWidth":2,"d":"M15 19l-7-7 7-7"}]}],"Previous"]}],["$","span",null,{"className":"font-medium text-gray-800 dark:text-gray-200 line-clamp-2","children":"word2vec(Skip-Gram with Negative Sampling)の理論と実装1"}]]}]}],["$","div",null,{"className":"w-8"}],["$","div",null,{"className":"flex-1 text-right","children":["$","$L6",null,{"href":"/blog20","className":"group flex flex-col items-end p-4 rounded-lg border border-gray-100 hover:border-orange-200 hover:bg-orange-50 transition-all","children":[["$","span",null,{"className":"text-sm text-gray-500 mb-1 group-hover:text-orange-600 flex items-center","children":["Next",["$","svg",null,{"xmlns":"http://www.w3.org/2000/svg","className":"h-4 w-4 ml-1","fill":"none","viewBox":"0 0 24 24","stroke":"currentColor","children":["$","path",null,{"strokeLinecap":"round","strokeLinejoin":"round","strokeWidth":2,"d":"M9 5l7 7-7 7"}]}]]}],["$","span",null,{"className":"font-medium text-gray-800 dark:text-gray-200 line-clamp-2","children":"Kaggle奮闘記 〜Kuzushiji Recognition2〜"}]]}]}]]}],["$","$Ld",null,{}]]}],["$","div",null,{"className":"w-full md:w-[30%]","children":"$Le"}]]}]}] 9:null f:I[9009,["326","static/chunks/326-ead410bae2047633.js","986","static/chunks/986-f27c5a2c4d841870.js","42","static/chunks/app/%5Bslug%5D/page-419f452b4066bb25.js"],""] e:["$","aside",null,{"className":"w-full flex flex-col gap-6","children":[["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"プロフィール"}],["$","div",null,{"className":"flex items-center mb-4","children":[["$","$Lf",null,{}],["$","span",null,{"className":"font-bold","children":"Shingo.S"}]]}],["$","p",null,{"className":"mb-4 text-base leading-relaxed text-gray-600","children":["データサイエンティストとして働いています。仕事では主にPythonやSAS、Rを用いて分析しています。",["$","br",null,{}],["$","br",null,{}],"統計検定1級、Kaggle Expert。",["$","br",null,{}],"自然言語処理に興味があります。"]}],["$","div",null,{"className":"mt-4 flex items-center","children":["$","a",null,{"href":"https://x.com/shingo97358922","target":"_blank","rel":"noopener noreferrer","className":"flex items-center gap-2 text-gray-600 hover:text-black transition-colors","children":[["$","svg",null,{"className":"w-5 h-5","fill":"currentColor","viewBox":"0 0 24 24","aria-hidden":"true","children":["$","path",null,{"d":"M18.244 2.25h3.308l-7.227 8.26 8.502 11.24H16.17l-5.214-6.817L4.99 21.75H1.68l7.73-8.835L1.254 2.25H8.08l4.713 6.231zm-1.161 17.52h1.833L7.084 4.126H5.117z"}]}],["$","span",null,{"className":"text-sm font-medium","children":"@shingo97358922"}]]}]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"アクセスカウンター"}],["$","div",null,{"className":"flex flex-col gap-2 text-gray-600 pl-2","children":[["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"累計"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"72320"}]]}],["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"本日"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"17"}]]}],["$","div",null,{"className":"flex items-center gap-4","children":[["$","span",null,{"className":"w-12 text-right","children":"昨日"}],["$","span",null,{"children":":"}],["$","span",null,{"className":"font-mono text-lg font-bold","children":"33"}]]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"月別アーカイブ"}],["$","ul",null,{"className":"text-base text-gray-600","children":[["$","li","2026年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2026-02","className":"hover:text-blue-500 transition-colors block","children":["2026年2月"," (",1,")"]}]}],["$","li","2025年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2025-02","className":"hover:text-blue-500 transition-colors block","children":["2025年2月"," (",1,")"]}]}],["$","li","2024年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-10","className":"hover:text-blue-500 transition-colors block","children":["2024年10月"," (",1,")"]}]}],["$","li","2024年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-07","className":"hover:text-blue-500 transition-colors block","children":["2024年7月"," (",1,")"]}]}],["$","li","2024年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2024-02","className":"hover:text-blue-500 transition-colors block","children":["2024年2月"," (",1,")"]}]}],["$","li","2023年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-06","className":"hover:text-blue-500 transition-colors block","children":["2023年6月"," (",1,")"]}]}],["$","li","2023年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-05","className":"hover:text-blue-500 transition-colors block","children":["2023年5月"," (",1,")"]}]}],["$","li","2023年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2023-03","className":"hover:text-blue-500 transition-colors block","children":["2023年3月"," (",2,")"]}]}],["$","li","2022年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-10","className":"hover:text-blue-500 transition-colors block","children":["2022年10月"," (",2,")"]}]}],["$","li","2022年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-09","className":"hover:text-blue-500 transition-colors block","children":["2022年9月"," (",2,")"]}]}],["$","li","2022年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-07","className":"hover:text-blue-500 transition-colors block","children":["2022年7月"," (",1,")"]}]}],["$","li","2022年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2022-03","className":"hover:text-blue-500 transition-colors block","children":["2022年3月"," (",1,")"]}]}],["$","li","2021年11月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-11","className":"hover:text-blue-500 transition-colors block","children":["2021年11月"," (",1,")"]}]}],["$","li","2021年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-10","className":"hover:text-blue-500 transition-colors block","children":["2021年10月"," (",2,")"]}]}],["$","li","2021年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-07","className":"hover:text-blue-500 transition-colors block","children":["2021年7月"," (",2,")"]}]}],["$","li","2021年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-05","className":"hover:text-blue-500 transition-colors block","children":["2021年5月"," (",2,")"]}]}],["$","li","2021年4月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-04","className":"hover:text-blue-500 transition-colors block","children":["2021年4月"," (",2,")"]}]}],["$","li","2021年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2021-02","className":"hover:text-blue-500 transition-colors block","children":["2021年2月"," (",1,")"]}]}],["$","li","2020年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-10","className":"hover:text-blue-500 transition-colors block","children":["2020年10月"," (",1,")"]}]}],["$","li","2020年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-09","className":"hover:text-blue-500 transition-colors block","children":["2020年9月"," (",1,")"]}]}],["$","li","2020年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-07","className":"hover:text-blue-500 transition-colors block","children":["2020年7月"," (",1,")"]}]}],["$","li","2020年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-06","className":"hover:text-blue-500 transition-colors block","children":["2020年6月"," (",1,")"]}]}],["$","li","2020年5月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-05","className":"hover:text-blue-500 transition-colors block","children":["2020年5月"," (",1,")"]}]}],["$","li","2020年4月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-04","className":"hover:text-blue-500 transition-colors block","children":["2020年4月"," (",1,")"]}]}],["$","li","2020年3月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-03","className":"hover:text-blue-500 transition-colors block","children":["2020年3月"," (",1,")"]}]}],["$","li","2020年2月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-02","className":"hover:text-blue-500 transition-colors block","children":["2020年2月"," (",1,")"]}]}],["$","li","2020年1月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2020-01","className":"hover:text-blue-500 transition-colors block","children":["2020年1月"," (",1,")"]}]}],["$","li","2019年12月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-12","className":"hover:text-blue-500 transition-colors block","children":["2019年12月"," (",1,")"]}]}],["$","li","2019年11月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-11","className":"hover:text-blue-500 transition-colors block","children":["2019年11月"," (",2,")"]}]}],["$","li","2019年10月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-10","className":"hover:text-blue-500 transition-colors block","children":["2019年10月"," (",2,")"]}]}],["$","li","2019年9月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-09","className":"hover:text-blue-500 transition-colors block","children":["2019年9月"," (",1,")"]}]}],["$","li","2019年8月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-08","className":"hover:text-blue-500 transition-colors block","children":["2019年8月"," (",8,")"]}]}],["$","li","2019年7月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-07","className":"hover:text-blue-500 transition-colors block","children":["2019年7月"," (",1,")"]}]}],["$","li","2019年6月",{"className":"mb-2 border-b border-dashed border-gray-200 pb-1 last:border-none","children":["$","$L6",null,{"href":"/archives/2019-06","className":"hover:text-blue-500 transition-colors block","children":["2019年6月"," (",2,")"]}]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"タグ一覧"}],["$","div",null,{"className":"flex flex-wrap gap-2","children":[["$","$L6","データ分析",{"href":"/tags/データ分析","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["データ分析"," (",18,")"]}],["$","$L6","自然言語処理",{"href":"/tags/自然言語処理","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["自然言語処理"," (",16,")"]}],["$","$L6","SAS",{"href":"/tags/SAS","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["SAS"," (",12,")"]}],["$","$L6","Kaggle",{"href":"/tags/Kaggle","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["Kaggle"," (",3,")"]}],["$","$L6","雑談",{"href":"/tags/雑談","className":"bg-gray-100 hover:bg-blue-500 hover:text-white rounded px-2 py-1 text-sm transition-colors duration-200 text-gray-600","children":["雑談"," (",2,")"]}]]}]]}],["$","div",null,{"className":"bg-white p-6 rounded shadow-sm border","children":[["$","h3",null,{"className":"font-bold text-lg mb-4 text-[#494949] border-b pb-2","children":"広告枠"}],["$","div",null,{"className":"flex justify-center items-center","children":[["$","a",null,{"href":"https://px.a8.net/svt/ejp?a8mat=4AXI0F+CULTTE+348+6CWQP","rel":"nofollow","target":"_blank","children":["$","img",null,{"style":{"border":0},"width":"250","height":"250","alt":"","src":"https://www25.a8.net/svt/bgt?aid=260223855777&wid=001&eno=01&mid=s00000000404001068000&mc=1"}]}],["$","img",null,{"style":{"border":0},"width":"1","height":"1","src":"https://www17.a8.net/0.gif?a8mat=4AXI0F+CULTTE+348+6CWQP","alt":""}]]}]]}]]}]