`
deepfuture
  • 浏览: 4332300 次
  • 性别: Icon_minigender_1
  • 来自: 湛江
博客专栏
073ec2a9-85b7-3ebf-a3bb-c6361e6c6f64
SQLite源码剖析
浏览量:79402
1591c4b8-62f1-3d3e-9551-25c77465da96
WIN32汇编语言学习应用...
浏览量:68349
F5390db6-59dd-338f-ba18-4e93943ff06a
神奇的perl
浏览量:101473
Dac44363-8a80-3836-99aa-f7b7780fa6e2
lucene等搜索引擎解析...
浏览量:281118
Ec49a563-4109-3c69-9c83-8f6d068ba113
深入lucene3.5源码...
浏览量:14595
9b99bfc2-19c2-3346-9100-7f8879c731ce
VB.NET并行与分布式编...
浏览量:65542
B1db2af3-06b3-35bb-ac08-59ff2d1324b4
silverlight 5...
浏览量:31309
4a56b548-ab3d-35af-a984-e0781d142c23
算法下午茶系列
浏览量:45189
社区版块
存档分类
最新评论

lucene-使用htmlparser提取网页特定链接

阅读更多

1、以EMAIL为例:(以这个网页为例http://www.qunar.com/site/zh/Cooperate_4.shtml)

package extract;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class extracthtmllinkemail {

/**
* @param args
*/
public static String getText(String f){
StringBuffer sb=new StringBuffer();
try {
Parser parser=new Parser(f);
parser.setEncoding("UTF-8");
NodeFilter filter=new NodeClassFilter(){
public boolean accept(Node node){
return (LinkTag.class.isAssignableFrom(node.getClass())&&((LinkTag)node).isMailLink());
}
};

NodeList links=new NodeList();
for (NodeIterator e=parser.elements();e.hasMoreNodes();){
e.nextNode().collectInto(links, filter);
}
for (int i=0;i<links.size();i++){
LinkTag linktag=(LinkTag)links.elementAt(i);
sb.append("\n|"+linktag.getLinkText()+"|=>"+linktag.getLink());
}

} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return sb.toString();
}
public static void main(String[] args) {
// TODO Auto-generated method stub
String s=getText("./htmls/Qunar_com.htm");
System.out.print(s);
}
}
效果如下:


|Adservice@qunar.com|=>adservice@qunar.com
|adservice@qunar.com|=>adservice@qunar.com

2、以(www.hao123.com)为例,提取js链接

代码

package extract;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class extracthtmllinkemail {

/**
* @param args
*/
public static String getText(String f){
StringBuffer sb=new StringBuffer();
try {
Parser parser=new Parser(f);
parser.setEncoding("GB2312");
NodeFilter filter=new NodeClassFilter(){
public boolean accept(Node node){
return (LinkTag.class.isAssignableFrom(node.getClass())&&((LinkTag)node).isJavascriptLink());
}
};

NodeList links=new NodeList();
for (NodeIterator e=parser.elements();e.hasMoreNodes();){
e.nextNode().collectInto(links, filter);
}
for (int i=0;i<links.size();i++){
LinkTag linktag=(LinkTag)links.elementAt(i);
sb.append("\n|"+linktag.getLinkText()+"|=>"+linktag.getLink());
}

} catch (ParserException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return sb.toString();
}
public static void main(String[] args) {
// TODO Auto-generated method stub
String s=getText("./htmls/hao123.htm");
System.out.print(s);
}
}
效果如下:


|我的hao123|=>void(0);
|我喜欢的网站:|=>void(0)
|我定制的网站:|=>void(0)

分享到:
评论

相关推荐

    lucene-analyzers-smartcn-7.7.0-API文档-中文版.zip

    赠送jar包:lucene-analyzers-smartcn-7.7.0.jar; 赠送原API文档:lucene-analyzers-smartcn-7.7.0-javadoc.jar; 赠送源代码:lucene-analyzers-smartcn-7.7.0-sources.jar; 赠送Maven依赖信息文件:lucene-...

    lucene-core-2.9.4,lucene-core-3.0.2,lucene-core-3.0.3,lucene-core-3.4.0

    lucene-core-2.9.4,lucene-core-3.0.2,lucene-core-3.0.3,lucene-core-3.4.0

    lucene-core-7.7.0-API文档-中文版.zip

    赠送jar包:lucene-core-7.7.0.jar; 赠送原API文档:lucene-core-7.7.0-javadoc.jar; 赠送源代码:lucene-core-7.7.0-sources.jar; 赠送Maven依赖信息文件:lucene-core-7.7.0.pom; 包含翻译后的API文档:lucene...

    lucene-analyzers-common-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-analyzers-common-6.6.0.jar; 赠送原API文档:lucene-analyzers-common-6.6.0-javadoc.jar; 赠送源代码:lucene-analyzers-common-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-...

    lucene-core-7.2.1-API文档-中文版.zip

    赠送jar包:lucene-core-7.2.1.jar; 赠送原API文档:lucene-core-7.2.1-javadoc.jar; 赠送源代码:lucene-core-7.2.1-sources.jar; 赠送Maven依赖信息文件:lucene-core-7.2.1.pom; 包含翻译后的API文档:lucene...

    lucene-suggest-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-suggest-6.6.0.jar; 赠送原API文档:lucene-suggest-6.6.0-javadoc.jar; 赠送源代码:lucene-suggest-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-suggest-6.6.0.pom; 包含翻译后的API...

    lucene-backward-codecs-7.3.1-API文档-中英对照版.zip

    赠送jar包:lucene-backward-codecs-7.3.1.jar; 赠送原API文档:lucene-backward-codecs-7.3.1-javadoc.jar; 赠送源代码:lucene-backward-codecs-7.3.1-sources.jar; 赠送Maven依赖信息文件:lucene-backward-...

    lucene-core-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-core-6.6.0.jar; 赠送原API文档:lucene-core-6.6.0-javadoc.jar; 赠送源代码:lucene-core-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-core-6.6.0.pom; 包含翻译后的API文档:lucene...

    lucene-analyzers-smartcn-7.7.0-API文档-中英对照版.zip

    赠送jar包:lucene-analyzers-smartcn-7.7.0.jar; 赠送原API文档:lucene-analyzers-smartcn-7.7.0-javadoc.jar; 赠送源代码:lucene-analyzers-smartcn-7.7.0-sources.jar; 赠送Maven依赖信息文件:lucene-...

    lucene-spatial-extras-7.3.1-API文档-中英对照版.zip

    赠送jar包:lucene-spatial-extras-7.3.1.jar; 赠送原API文档:lucene-spatial-extras-7.3.1-javadoc.jar; 赠送源代码:lucene-spatial-extras-7.3.1-sources.jar; 赠送Maven依赖信息文件:lucene-spatial-extras...

    lucene-memory-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-memory-6.6.0.jar; 赠送原API文档:lucene-memory-6.6.0-javadoc.jar; 赠送源代码:lucene-memory-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-memory-6.6.0.pom; 包含翻译后的API文档...

    lucene-suggest-7.7.0-API文档-中文版.zip

    赠送jar包:lucene-suggest-7.7.0.jar; 赠送原API文档:lucene-suggest-7.7.0-javadoc.jar; 赠送源代码:lucene-suggest-7.7.0-sources.jar; 赠送Maven依赖信息文件:lucene-suggest-7.7.0.pom; 包含翻译后的API...

    lucene-spatial-extras-7.2.1-API文档-中英对照版.zip

    赠送jar包:lucene-spatial-extras-7.2.1.jar; 赠送原API文档:lucene-spatial-extras-7.2.1-javadoc.jar; 赠送源代码:lucene-spatial-extras-7.2.1-sources.jar; 赠送Maven依赖信息文件:lucene-spatial-extras...

    lucene-sandbox-7.2.1-API文档-中文版.zip

    赠送jar包:lucene-sandbox-7.2.1.jar; 赠送原API文档:lucene-sandbox-7.2.1-javadoc.jar; 赠送源代码:lucene-sandbox-7.2.1-sources.jar; 赠送Maven依赖信息文件:lucene-sandbox-7.2.1.pom; 包含翻译后的API...

    lucene-spatial-extras-6.6.0-API文档-中英对照版.zip

    赠送jar包:lucene-spatial-extras-6.6.0.jar; 赠送原API文档:lucene-spatial-extras-6.6.0-javadoc.jar; 赠送源代码:lucene-spatial-extras-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-spatial-extras...

    lucene-spatial-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-spatial-6.6.0.jar; 赠送原API文档:lucene-spatial-6.6.0-javadoc.jar; 赠送源代码:lucene-spatial-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-spatial-6.6.0.pom; 包含翻译后的API...

    lucene-misc-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-misc-6.6.0.jar; 赠送原API文档:lucene-misc-6.6.0-javadoc.jar; 赠送源代码:lucene-misc-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-misc-6.6.0.pom; 包含翻译后的API文档:lucene...

    lucene-core-6.6.0-API文档-中英对照版.zip

    赠送jar包:lucene-core-6.6.0.jar; 赠送原API文档:lucene-core-6.6.0-javadoc.jar; 赠送源代码:lucene-core-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-core-6.6.0.pom; 包含翻译后的API文档:lucene...

    lucene-backward-codecs-6.6.0-API文档-中英对照版.zip

    赠送jar包:lucene-backward-codecs-6.6.0.jar; 赠送原API文档:lucene-backward-codecs-6.6.0-javadoc.jar; 赠送源代码:lucene-backward-codecs-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-backward-...

    lucene-backward-codecs-6.6.0-API文档-中文版.zip

    赠送jar包:lucene-backward-codecs-6.6.0.jar; 赠送原API文档:lucene-backward-codecs-6.6.0-javadoc.jar; 赠送源代码:lucene-backward-codecs-6.6.0-sources.jar; 赠送Maven依赖信息文件:lucene-backward-...

Global site tag (gtag.js) - Google Analytics