<?php
/**
 * Fetch a URL with cURL (GET) and return the response body.
 *
 * Redirects are followed and response headers are not included in the result.
 * If cURL reports an error, the error message is printed to standard output.
 *
 * @param string            $url       Absolute URL to request.
 * @param array|string|bool $header    Request header line(s), e.g. array('Accept: text/html').
 *                                     A single string is treated as one header line;
 *                                     an empty value / false sends no extra headers.
 * @param bool              $verifySsl When false (the default, kept for backward compatibility),
 *                                     peer and host certificate verification are disabled for
 *                                     URLs starting with "https". Pass true to leave cURL's
 *                                     normal TLS verification in place.
 *
 * @return string|bool The response body, or false when the request could not be performed.
 */
function getUrl($url, $header = false, $verifySsl = false)
{
    $ch = curl_init($url);
    if ($ch === false) {
        // No handle could be created; report failure the same way curl_exec() does.
        return false;
    }

    curl_setopt($ch, CURLOPT_HEADER, 0);
    // Return the transfer as a string instead of writing it straight to output.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    // Optional request headers. CURLOPT_HTTPHEADER needs an array, so a lone
    // header string is wrapped into a one-element array.
    if (!empty($header)) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, (array) $header);
    }

    // HTTPS: verification is switched off unless the caller opts in.
    // NOTE(review): disabling verification allows man-in-the-middle tampering;
    // prefer $verifySsl = true wherever a CA bundle is available.
    if (!$verifySsl && strncmp($url, 'https', 5) === 0) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    }

    // Follow 3xx redirects.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

    // Run the request and keep the result.
    $content = curl_exec($ch);

    $curlError = curl_error($ch);
    if ($curlError !== '') {
        print_r($curlError);
    }

    curl_close($ch);

    return $content;
}
// Douyin user profile page; the trailing digits are that user's profile ID.
$content = getUrl("https://www.iesdouyin.com/share/user/1662082192775240");

// getUrl() returns false when the request fails; match against an empty string in that case.
$pageHtml = is_string($content) ? $content : '';

// Capture the inner HTML of the follower-count <span>.
// (Regex: quick to write, painful to maintain afterwards.)
preg_match_all("/<span class=\"follower block\">(.*)粉丝<\/span>/U", $pageHtml, $arr);
print_r($arr);

// The follower <span> may be absent (failed request, changed markup); fall back to an
// empty string rather than reading an undefined offset.
$followerHtml = isset($arr[1][0]) ? $arr[1][0] : '';

// Capture each iconfont glyph inside the follower <span>.
// /U makes the quantifiers ungreedy; without it, (.*) would run from the first <i>
// to the last </i> and the individual <i> tags would not be matched separately.
preg_match_all("/<i class=\"icon iconfont follow-num\">(.*)<\/i>/U", $followerHtml, $arri);
/*
Pattern modifier notes:
/U inverts greediness, so quantifiers are ungreedy by default
/u treats pattern and subject as Unicode (UTF-8); mainly relevant for multi-byte text such as Chinese
/i makes matching case-insensitive (an "a" in the pattern also matches "A")
/s treats the subject as a single line (the dot also matches newlines)
*/
print_r($arri);
// Unfinished: the remaining step is to replace the iconfont glyphs with digits.
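/*
 * Code table: maps each iconfont glyph to the digit it renders as
 * (three glyph variants per digit).
 *
 * NOTE: the glyph keys below are empty. They appear to have been private-use
 * iconfont characters that were lost when this snippet was copied; restore them
 * from the page's font mapping before using the table.
 * The table is not PHP, so it is kept inside this comment to keep the file parseable.

codes = {
"": "0", "": "0", "": "0",
"": "1", "": "1", "": "1",
"": "2", "": "2", "": "2",
"": "3", "": "3", "": "3",
"": "4", "": "4", "": "4",
"": "5", "": "5", "": "5",
"": "6", "": "6", "": "6",
"": "7", "": "7", "": "7",
"": "8", "": "8", "": "8",
"": "9", "": "9", "": "9"
}
*/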
?>
Hellozjx
人活着一定要争口气,哪怕生为草绳,也要做绑螃蟹的那根。
php中利用curl和preg_match_all抓取抖音粉丝数据
hellozjx·2020-04-16·800 次阅读
Comments | NOTHING