Program description:
A friend needed something like this (UniBI?), so I put together a quick one. You give it a Baidu search keyword on the command line together with the number of result pages you want, and the program automatically downloads all of the search result pages. Strictly speaking, not quite all of them: to keep things simple, pages from Baidu Zhidao and Tieba are not collected. A sketch of an example run comes first, then the code.
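This is only an illustrative sketch: the source file name web.c and the keyword linux are made up for the example, while the binary name web matches the usage message in the code.

gcc -o web web.c
./web linux 2

For each requested page the program then shells out to a curl command of roughly this shape (the pn parameter advances in steps of 10, one step per result page):

curl -o baidu0.html 'http://www.baidu.com/s?wd=linux&pn=0'
curl -o baidu1.html 'http://www.baidu.com/s?wd=linux&pn=10'
curl -o baidu2.html 'http://www.baidu.com/s?wd=linux&pn=20'

The program code is as follows: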
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
    int i;
    char itochar[5];
    int len;
    FILE *stream;
    int dweb = 0;
    int dwebstr;
    char bd[20] = " baidu";
    char *bdhtml = ".html ";
    char godown[512] = "curl -o";
    char urlmsg[512];
    char *purl = urlmsg;
    char serurl[30] = "'http://www.baidu.com/s?wd=";
    char urlsufix[5] = "&pn=";
    char buf[5];

    if (argc < 3) {
        printf ("Usage: ./web <keyword> <number of result pages (at most 9)>\n");
        exit (1);
    }
    dweb = atoi (argv[2]);
    if (dweb > 9) {
        printf ("The number of pages must be within 10\n");
        exit (1);
    }

    /* Download the search result pages (pages 0..dweb) */
    for (i = 0; i <= dweb; i++) {
        sprintf (itochar, "%d", i);
        strcat (bd, itochar);               /* becomes " baidu<x>", x is the page number */
        strcat (bd, bdhtml);                /* e.g. " baidu7.html " */
        strcat (godown, bd);                /* "curl -o baidu7.html " (trailing space) */
        strcat (godown, serurl);            /* "curl -o baidux.html 'http://www.baidu.com/s?wd=" */
        dwebstr = i * 10;
        sprintf (buf, "%d", dwebstr);       /* Baidu's pn parameter: 10 results per page */
        strcat (godown, argv[1]);
        strcat (godown, urlsufix);
        strcat (godown, buf);
        len = strlen (godown);
        godown[len] = '\'';                 /* closing single quote */
        godown[len + 1] = '\0';             /* the command is complete */
        system (godown);
        sprintf (godown, "%s", "curl -o "); /* reset both buffers for the next page */
        sprintf (bd, "%s", " baidu");
    }

    system ("sh filter.sh");                /* extract the result URLs into merge.txt */

    stream = fopen ("merge.txt", "r");
    if (stream == NULL) {
        perror ("fopen");
        exit (1);
    }

    i = 1;                                  /* sequence number used to name the downloaded pages */
    sprintf (godown, "%s", "curl -o ");     /* do not rely on what the first loop left behind */
    fread (purl, 1, 1, stream);
    while (!feof (stream)) {
        while (*purl != '\n' && !feof (stream)) {
            purl += 1;
            fread (purl, 1, 1, stream);
        }
        *purl = '\0';                       /* one URL line has been read */
        sprintf (buf, "%d", i);
        strcat (godown, buf);               /* "curl -o 1" */
        strcat (godown, bdhtml);            /* "curl -o 1.html " */
        strcat (godown, "'");               /* quote the URL so characters such as & do not break the shell */
        strcat (godown, urlmsg);
        strcat (godown, "'");               /* "curl -o 1.html '<url>'" */
        system (godown);
        i += 1;
        fread (purl, 1, 1, stream);
        sprintf (godown, "%s", "curl -o ");
        purl = urlmsg;                      /* rewind to the start of the URL buffer */
    }
    fclose (stream);
    return 0;
}

The program mainly relies on curl's download capability, together with a small script that extracts the page URLs from the downloaded HTML; a short illustration of the extraction step follows the script. The script is as follows:
#!/bin/bash
# Pull the result URLs out of every downloaded page.
for file in baidu*.html; do
    # Keep only the lines that hold a search-result entry.
    sed '/<table border="0" cellpadding="0" cellspacing="0" id=/!d' $file > $file.txt
    # Cut away everything before the href attribute.
    awk -F'href="' '{print $2}' $file.txt > $file.txt.txt
    rm -f $file.txt
    # Trim anything that follows a /" sequence.
    awk -F'/"' '{print $1}' $file.txt.txt > $file.txt
    rm -f $file.txt.txt
    # Trim the closing quote and the target attribute after the URL.
    awk -F'"  target="' '{print $1}' $file.txt > $file.txt.txt
    mv $file.txt.txt $file.txt
done
# Merge all extracted URL lists into the file the C program reads.
for file in *.txt; do
    cat $file >> merge.txt
done
rm -f baidu*.html.txt
exit 0
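To make the extraction step more concrete, here is a small, simplified illustration of the field-separator trick the script relies on. The sample HTML line is made up and only imitates the general shape of a result entry (the real Baidu markup may differ), and the temporary .txt files are skipped:

echo '<td><a href="http://example.com/page" target="_blank">example title</a></td>' \
    | awk -F'href="' '{print $2}' \
    | awk -F'" target="' '{print $1}'
# prints: http://example.com/page

The real filter.sh also trims at /" and expects two spaces before target, but the idea is the same: choose a field separator that brackets the URL and print the field next to it.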