{"id":165,"date":"2009-01-13T17:35:37","date_gmt":"2009-01-13T17:35:37","guid":{"rendered":"http:\/\/newblog.mix1009.net\/?p=165"},"modified":"2009-01-13T17:35:37","modified_gmt":"2009-01-13T17:35:37","slug":"jquery-jaxer-html-parsing","status":"publish","type":"post","link":"https:\/\/mix1009.net\/?p=165","title":{"rendered":"jQuery, Jaxer, HTML Parsing"},"content":{"rendered":"<p><a href=\"http:\/\/jquery.com\">jQuery<\/a>\ub97c \ubcf8\uaca9\uc801(?)\uc73c\ub85c \uc4f0\uae30 \uc2dc\uc791\ud588\ub294\ub370 \uc4f8\uc218\ub85d \ub9c8\uc74c\uc5d0 \ub4dc\ub124\uc694. \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8 \ub2f5\uc9c0 \uc54a\uc740 \ubb38\ubc95, selector\ub97c \ud1b5\ud55c \uc790\uc720\ub85c\uc6b4 element \uc120\ud0dd, \ud55c\ubc88\uc5d0 \uc5ec\ub7ec element\uc5d0 \uc791\uc5c5\ud560\uc218 \uc788\ub294 \ud2b9\uc9d5. \uc5b4\ub5bb\uac8c \uc774\ub7f0 \ubb38\ubc95\uc73c\ub85c \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \ub9cc\ub4e4 \uc0dd\uac01\uc744 \ud588\ub294\uc9c0, \uadf8\ub9ac\uace0 \uc774\ub807\uac8c \uc798 \uad6c\ud604\ud588\ub294\uc9c0 \uc5ec\ub7ec\uac00\uc9c0 \uc0dd\uac01\uc744 \ud558\uac8c\ub054\ud558\ub294 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc785\ub2c8\ub2e4.<\/p>\n<p>\uc6f9\ud504\ub85c\uadf8\ub798\ubc0d\uc744 \ud558\uba74 MVC \ubd84\ub9ac\uac00 \uc798 \uc548\ub410\uc5c8\ub294\ub370 jQuery\ub97c \uc774\uc6a9\ud558\uba74 \uc774\uac8c \uc5b4\ub290\uc815\ub3c4 \uac00\ub2a5\ud55c\uac70 \uac19\ub124\uc694. 
\uc77c\uc8fc\uc77c\ub3d9\uc548 \uc0ac\uc6a9\ud558\ub294\ub370 \ub108\ubb34 \ub9c8\uc74c\uc5d0 \ub4e4\uc5b4\uc11c \uc11c\ubc84\ucabd\uc5d0\uc11c\ub3c4 \uc774\ub7f0 \uc791\uc5c5\uc744 \ud558\uba74 \ud3b8\ud560\uac70 \uac19\uc740 \uc0dd\uac01\uc774 \ub4e4\uc5b4\uc11c \uc880 \ucc3e\uc544\ubd24\uc2b5\ub2c8\ub2e4.<\/p>\n<p>\uba3c\uc800 \uc790\ubc14\ub85c \uc9e0 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8 \uc5d4\uc9c4\uc778 <a href=\"http:\/\/ejohn.org\/projects\/bringing-the-browser-to-the-server\/\">Rhino \uc704\uc5d0\uc11c \uc5ec\ub7ec\uac00\uc9c0 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8 \ub77c\uc774\ube0c\ub7ec\ub9ac(jquery, prototype, MochiKit)\ub97c \ub3cc\ub9b0 \uc2dc\ub3c4<\/a>\uac00 \uc788\uc5b4\uc11c \uc800\ub3c4 \ub530\ub77c\ud574\ubd24\uc9c0\ub9cc \ubb50\uac00 \ubb38\uc81c\uc778\uc9c0 \uc815\uc0c1\ub3d9\uc791\ud558\uc9c0 \uc54a\ub354\uad70\uc694. \uc608\uc81c\uc5d0 \ubcf4\uba74 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8\ub85c \uc11c\ubc84\ucabd \uc2a4\ud06c\ub9bd\ud305\uc744 \ud558\ub294\ub370, \uc77c\ub2e8 \ud654\uba74\uc5d0 \ucd9c\ub825\ud558\ub294\uac70\ubd80\ud130 \uc775\uc219\ud558\uc9c0\uac00 \uc54a\uc73c\ub2c8, \ubb50\uac00 \uc798\ubabb\ub410\ub294\uc9c0 \ucc3e\uc544\ubcf4\uae30\ub3c4 \ud798\ub4e4\ub354\uad70\uc694. \uae00 \uc62c\ub77c\uc628\uc9c0 1\ub144\uc774 \ub118\uc5b4\uc11c \ubc84\uc804 \ucc28\uc774\ub54c\ubb38\uc5d0 \uadf8\ub7f0\uac74\uc9c0&#8230; \uc790\ubc14\uc758 \ubc84\uc804\ub54c\ubb38\uc5d0 \uadf8\ub7f0\uac74\uc9c0&#8230; \ud558\uc5ec\uac04 \uc911\uac04\uc5d0 \ud3ec\uae30!<\/p>\n<p>\ubb38\uc81c \ud574\uacb0\uc744 \uc704\ud574\uc11c \uc6f9\uc11c\ud551\ud558\ub2e4\uac00 <a href=\"http:\/\/www.aptana.com\/jaxer\">Aptana Jaxer<\/a>\uc5d0 \ub300\ud574\uc11c \uc54c\uac8c\ub418\uc5c8\uc2b5\ub2c8\ub2e4. \uc804\uc5d0 \uc774\ub984\ub9cc \ub4e4\uc5b4\ubd24\uc5c8\ub294\ub370, \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8\ub85c \uc11c\ubc84\ub2e8\uc5d0\uc11c\ub3c4 \ud504\ub85c\uadf8\ub798\ubc0d\ud560\uc218 \uc788\uac8c \ud558\ub294 \ud504\ub85c\uc81d\ud2b8\uc785\ub2c8\ub2e4. 
\ub2e8\uc21c\ud788 HTML\uc5d0\uc11c script\uc5d0 runat=&quot;server&quot;\ub85c \ub2ec\uc544\uc8fc\uba74, \ud574\ub2f9 \ubd80\ubd84\uc774 \uc11c\ubc84\ub2e8\uc5d0\uc11c \ucc98\ub9ac\ud55c \ub2e4\uc74c\uc5d0 \uacb0\uacfc\uac00 \ud074\ub77c\uc774\uc5b8\ud2b8\uc5d0 \uc804\ub2ec\ub429\ub2c8\ub2e4. \ud074\ub77c\uc774\uc5b8\ud2b8 \ub2e8\uc5d0\uc11c \uc798 \ub3cc\uc544\uac00\ub294 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8\ub97c \uc11c\ubc84\ub2e8\uc5d0\uc11c \uc218\uc815\uc5c6\uc774 \ubc14\ub85c \ub3cc\ub9b4\uc218 \uc788\uc2b5\ub2c8\ub2e4. jQuery\ub4f1\uc758 \ub77c\uc774\ube0c\ub7ec\ub9ac\ub3c4 \uc11c\ubc84\ub2e8\uc5d0\uc11c \uc798 \ub3cc\uc544\uac11\ub2c8\ub2e4:) \uc5ec\uae30\uc11c \ub354 \ub098\uc544\uac00 \uc11c\ubc84\uc640 \ud074\ub77c\uc774\uc5b8\ud2b8 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8\uac04 \ud1b5\uc2e0\uc744 \ud560\uc218 \uc788\ub294 \ubc29\ubc95\uc744 \ub9cc\ub4e0\uac70 \uac19\ub124\uc694. \uc11c\ubc84\ub2e8 \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8\uc5d0\uc11c\ub294 \ub514\ube44\uc811\uc18d, \ud30c\uc77c\uc811\uadfc \ub4f1 \uc11c\ubc84\ub2e8 \uc5b8\uc5b4\uc5d0\uc11c \ud560\uc218 \uc788\ub294 \uae30\ub2a5\ub4e4\uc744 \ub77c\uc774\ube0c\ub7ec\ub9ac\ub85c \uc81c\uacf5\ud569\ub2c8\ub2e4. 
\uac04\ub2e8\ud788 \ud074\ub77c\uc774\uc5b8\ud2b8 \ub2e8\uc5d0\uc11c \ud558\ub358 \uc77c \uc815\ub3c4\ub294 \uac04\ud3b8\ud558\uac8c \uc11c\ubc84\ub2e8\uc5d0\uc11c \ucc98\ub9ac\ud560\uc218 \uc788\uc9c0\ub9cc, \uc6f9\uc11c\ubc84\ub3c4 \ub530\ub85c \ub744\uc6cc\uc57c\ud558\uace0, \ub2e4\ub978 \uc11c\ubc84\ub2e8 \uc5b8\uc5b4\uc640\uc758 \uc5f0\ub3d9\uc744 \uc0dd\uac01\ud558\ub2c8, \uc11c\ubc84\ub2e8\uc5d0\uc11c jQuery\ub97c \ub3cc\ub9b0 \uacb0\uacfc\uac12\ub9cc HTML\ub85c \ubc1b\uc544\uc11c \uc800\uc7a5\ud558\ub294\uac8c \ub354 \ud3b8\ud560\uac83 \uac19\ub2e4\ub294 \uc0dd\uac01\uc774 \ub4e4\uc5b4\uc11c Jaxer \ub3c4\uc785\uc740 \ubcf4\ub958\ud588\uc2b5\ub2c8\ub2e4.<\/p>\n<p>jQuery\ucc98\ub7fc \ud3b8\ud558\uac8c HTML \ub370\uc774\ud0c0\uc5d0 \uc811\uadfc\ud574\uc11c \ub370\uc774\ud0c0\ub97c \ucd94\ucd9c\ud558\uace0, \ubcc0\uacbd\ud560\uc218 \uc788\ub294 \ubc29\ubc95\uc744 \ucc3e\uc544\ubd24\uc2b5\ub2c8\ub2e4. jQuery\uc758 selector\ub098 XPath\ub4f1\uc73c\ub85c HTML\uc744 \ud30c\uc2f1\ud560\uc218 \uc788\uc73c\uba74 \uc88b\uc744\uac70\ub77c \uc0dd\uac01\ud558\uace0 \uac80\uc0c9\ud574\ubcf4\ub2c8 libxml2\uc5d0\uc11c HTML \ud30c\uc2f1\uc744 \uc9c0\uc6d0\ud558\ub354\uad70\uc694:) python binding\uc774 \uc788\uace0, \uadf8\uc804\uc5d0 <a href=\"http:\/\/mix1009.net\/87\">XML\uc5d0\uc11c\ub294 XPath\ub97c \uc774\uc6a9\ud55c \uacbd\ud5d8<\/a>\ub3c4 \uc788\uc73c\ub2c8 \uae08\ubc29 \ub420\ub4ef\ud558\ub354\uad70\uc694. jQuery\ub85c \uc9e0 \ub77c\uc778\ub4e4\uc744 \ubcf5\uc0ac\ud558\uace0, \uadf8\ub300\ub85c XPath\uc640 libxml2 \ub77c\uc774\ube0c\ub7ec\ub9ac \ud568\uc218\ub4e4\ub85c \uc62e\uacbc\uc2b5\ub2c8\ub2e4. \ub77c\uc778\uc218\ub294 \ub9ce\uc774 \uae38\uc5b4\uc84c\uc9c0\ub9cc \uadf8\ub798\ub3c4 \uc18c\uc2a4\ub294 \ubcfc\ub9cc\ud558\uace0, selector\ub85c\ub294 \ubd88\uac00\ub2a5\ud55c \ubd80\ubd84\ub4e4\ub3c4 \uc27d\uac8c \uad6c\ud604\ud560\uc218 \uc788\ub2e4\ub294 \uc0dd\uac01\uc774 \ub4e4\ub354\uad70\uc694. 
<\/p>\n<p>[code type=javascript]<br \/>\n &nbsp;&nbsp; $(&#8216;a[href^=\/]&#8217;).each(function(i) {$(this).replaceWith($(this).text()); } );<br \/>\n &nbsp;&nbsp; $(&#8216;a[href^=#cite]&#8217;).remove();<br \/>\n &nbsp;&nbsp; $(&#8216;span[class=editsection]&#8217;).remove();<br \/>\n  &nbsp;&nbsp; $(&#8216;table[class^=navbox ]&#8217;).remove();<br \/>\n &nbsp;&nbsp; $(&#8216;div[class=\uc8fc\uc11d]&#8217;).remove();<br \/>\n &nbsp;&nbsp; $(&#8216;table[id=toc]&#8217;).remove();<br \/>\n &nbsp;&nbsp; $(&#8216;h2 &gt; span&#8217;).each(function(i) { if ($(this).text() == &#8220;\uc8fc\uc11d&#8221;) $(this).remove(); } );<br \/>\n [\/code]<br \/>\n [code type=python]<br \/>\n &nbsp;&nbsp; try:<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; hdoc = libxml2.htmlReadFile(filename, None, parse_options)<br \/>\n &nbsp;&nbsp; except Exception, e:<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; print e<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; return<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;a[href^=\/]&#8217;).each(function(i) {$(this).replaceWith($(this).text()); } );<br \/>\n &nbsp;&nbsp; #$(&#8216;a[href^=#cite]&#8217;).remove();<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/a&#8217;):<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; href = e.prop(&#8220;href&#8221;)<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; if not href: continue<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; if href.find(&#8220;\/&#8221;)==0:<br \/>\n &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; node = libxml2.newText(e.content)<br \/>\n &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; e.replaceNode(node)<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; elif href.find(&#8220;#cite&#8221;)==0:<br \/>\n &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; e.unlinkNode()<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;span[class=editsection]&#8217;).remove();<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/span[@class=&#8221;editsection&#8221;]&#8217;): 
e.unlinkNode()<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;table[class^=navbox ]&#8217;).remove();<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/table&#8217;):<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; cl = e.prop(&#8220;class&#8221;)<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; if not cl: continue<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; if cl.split(&#8221; &#8220;).count(&#8220;navbox&#8221;)&gt;0:<br \/>\n &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; e.unlinkNode()<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;div[class=\uc8fc\uc11d]&#8217;).remove();<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/div[@class=&#8221;\uc8fc\uc11d&#8221;]&#8217;): e.unlinkNode()<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;table[id=toc]&#8217;).remove();<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/table[@id=&#8221;toc&#8221;]&#8217;): e.unlinkNode()<\/p>\n<p> &nbsp;&nbsp; #$(&#8216;h2 &gt; span&#8217;).each(function(i) { if ($(this).text() == &#8220;\uc8fc\uc11d&#8221;) $(this).remove(); } );<br \/>\n &nbsp;&nbsp; for e in hdoc.xpathEval(&#8216;\/\/h2\/span&#8217;):<br \/>\n &nbsp; &nbsp; &nbsp;&nbsp; if e.content == &#8220;\uc8fc\uc11d&#8221;:<br \/>\n &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; e.unlinkNode()<br \/>\n[\/code]<\/p>\n<p>libxml2 python binding\uc758 \ubb38\uc81c\uc778\uc9c0 \ubaa8\ub974\uaca0\uc9c0\ub9cc, htmlParseDoc()\uc744 \uc0ac\uc6a9\ud558\uba74 xpath\uac00 \ub3d9\uc791\ud558\uc9c0 \uc54a\uace0, htmlParseFile\uc744 \uc0ac\uc6a9\ud558\uba74 html \ud30c\uc2f1 \uad00\ub828 \uacbd\uace0\uac00 \ucd9c\ub825\ub418\ub294\ub370 \ucd9c\ub825\uc548\ub418\uac8c \ud560 \ubc29\ubc95\uc774 \uc5c6\ub294\ub4ef\ud558\ub124\uc694.<\/p>\n<p>\uc704 \uc18c\uc2a4\ub294 \uc704\ud0a4\ud53c\ub514\uc544\uc5d0\uc11c \uae00\uc744 \uc77d\uc5b4\uc11c \ud544\uc694\uc5c6\ub294 \ubd80\ubd84\uc744 \uc81c\uac70\ud558\ub294 \uc18c\uc2a4\uc774\uace0, \uc774\ud6c4\ub85c \uc5ec\ub7ec\uac00\uc9c0 \uc694\uad6c\uc0ac\ud56d\ub4e4\uc774 \uc0dd\uaca8\uc11c \ub9ce\uc774 
\ud655\uc7a5\ub410\uc2b5\ub2c8\ub2e4.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>jQuery\ub97c \ubcf8\uaca9\uc801(?)\uc73c\ub85c \uc4f0\uae30 \uc2dc\uc791\ud588\ub294\ub370 \uc4f8\uc218\ub85d \ub9c8\uc74c\uc5d0 \ub4dc\ub124\uc694. \uc790\ubc14\uc2a4\ud06c\ub9bd\ud2b8 \ub2f5\uc9c0 \uc54a\uc740 \ubb38\ubc95, selector\ub97c \ud1b5\ud55c \uc790\uc720\ub85c\uc6b4 element \uc120\ud0dd, \ud55c\ubc88\uc5d0 \uc5ec\ub7ec element\uc5d0 \uc791\uc5c5\ud560\uc218 \uc788\ub294 \ud2b9\uc9d5. \uc5b4\ub5bb\uac8c \uc774\ub7f0 \ubb38\ubc95\uc73c\ub85c \ub77c\uc774\ube0c\ub7ec\ub9ac\ub97c \ub9cc\ub4e4 \uc0dd\uac01\uc744 \ud588\ub294\uc9c0, \uadf8\ub9ac\uace0 \uc774\ub807\uac8c \uc798 \uad6c\ud604\ud588\ub294\uc9c0 \uc5ec\ub7ec\uac00\uc9c0 \uc0dd\uac01\uc744 \ud558\uac8c\ub054\ud558\ub294 \ub77c\uc774\ube0c\ub7ec\ub9ac\uc785\ub2c8\ub2e4. \uc6f9\ud504\ub85c\uadf8\ub798\ubc0d\uc744 \ud558\uba74 MVC \ubd84\ub9ac\uac00 \uc798 \uc548\ub410\uc5c8\ub294\ub370 jQuery\ub97c \uc774\uc6a9\ud558\uba74 \uc774\uac8c \uc5b4\ub290\uc815\ub3c4 \uac00\ub2a5\ud55c\uac70 \uac19\ub124\uc694. 
\uc77c\uc8fc\uc77c\ub3d9\uc548 \uc0ac\uc6a9\ud558\ub294\ub370 \ub108\ubb34 \ub9c8\uc74c\uc5d0 \ub4e4\uc5b4\uc11c \uc11c\ubc84\ucabd\uc5d0\uc11c\ub3c4 [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[19],"tags":[146,147,148,91],"class_list":["post-165","post","type-post","status-publish","format-standard","hentry","category-19","tag-html-parsing","tag-jaxer","tag-jquery","tag-libxml2"],"_links":{"self":[{"href":"https:\/\/mix1009.net\/index.php?rest_route=\/wp\/v2\/posts\/165","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/mix1009.net\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mix1009.net\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mix1009.net\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/mix1009.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=165"}],"version-history":[{"count":0,"href":"https:\/\/mix1009.net\/index.php?rest_route=\/wp\/v2\/posts\/165\/revisions"}],"wp:attachment":[{"href":"https:\/\/mix1009.net\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=165"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/mix1009.net\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=165"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mix1009.net\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=165"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}