Open airuikun opened 5 years ago
// Keeps only the quoted href/src attributes of every start tag that has attributes.
// $1 = tag name, $2 = ` href="..."`, $4 = ` src="..."` ($3/$5 are the quote characters).
// The lazy `[^>]*?` scan lives INSIDE each optional group: when it sat outside, the
// optional capture could match empty right after the tag name, so href/src was kept
// only when it was the very first attribute.
// Limitations: unquoted href/src values are not recognised, and the output order is
// always href then src. `replace` returns a new string; `html` itself is not modified.
html.replace(/<([\w-]+)(?=\s)(?=(?:[^>]*?(\shref=(['"]).*?\3))?)(?=(?:[^>]*?(\ssrc=(['"]).*?\5))?)[^>]*/g, '<$1$2$4')
匹配标签的属性内容:<([\w-]+)(.*?)>
匹配对应每项属性:\s?([\w-]+)=['"]?[\w-\s]+['"]?
// Sample markup: quoted, unquoted and valueless attributes, plus self-closing tags.
const html = `<div title="test">
<icon v-if="icon" name="info" class="m-message--icon" slot="icon" />
<img src="http://img.123.jpg" title="img"/>
<input checked=true data-value="12" checked/>
<a href="#/test" on-click="jump"></a>
<m-alert title='Alert message'>alert</m-alert>
</div>`;
// Attribute names that are allowed to stay.
const whiteList = [ 'href', 'src' ];

/**
 * Removes every attribute whose name is not listed in `allowed` from the start
 * tags found in `markup`.
 *
 * Attributes are tokenised one by one (name, optional `=`, then a double-quoted,
 * single-quoted or unquoted value), so valueless attributes such as `checked`
 * are removed too and text inside a quoted value is never mistaken for an
 * attribute. Closing tags and text content are left untouched.
 *
 * @param {string} markup - HTML source to clean.
 * @param {string[]} allowed - Lower-case attribute names to keep.
 * @returns {string} A copy of `markup` with the other attributes removed.
 */
function keepWhitelistedAttributes(markup, allowed) {
  // <tag, then any number of attributes, then optional whitespace and "/", then ">".
  const startTag = /<([a-zA-Z][\w-]*)((?:\s+[^\s"'<>\/=]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?)*)(\s*\/?)>/g;
  // One attribute including its leading whitespace; group 1 is the name.
  const attribute = /\s+([^\s"'<>\/=]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?/g;
  return markup.replace(startTag, function (ori, tag, attrs, tail) {
    const kept = attrs.replace(attribute, function (attr, name) {
      return allowed.indexOf(name.toLowerCase()) > -1 ? attr : '';
    });
    return '<' + tag + kept + tail + '>';
  });
}

// String.prototype.replace returns a new string, so the result has to be captured.
const result = keepWhitelistedAttributes(html, whiteList);
console.log(result);
以上例子还是存在一些属性无法去除,比如没有值的属性
正则写不来,但是好像可以直接js获取元素来实现,哈哈哈哈
/\s(?!src=|href=)[\w-]*=['\w"]+/
还是要限制一下尖括号吧,不然正文里的也匹配上了
来个boss级的终极版,参考引用于vue源码 /\s(?!href=|src=)[^\s"'<>\/=]+(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/g
贡献一个测试用例:
<div value="src='AAA AAA'" src='BBBBBB' href=abcabc controls>
value="src='AAA AAA'" src='BBBBBB' href=abcabc
</div>
这个用例覆盖了以下情况:双引号里带空格、单引号的值;单引号里带空格、双引号的值;没有引号的值;不属于 attribute 的正文文本;没有值的 attribute。
// Based on the attribute pattern from the Vue source; admittedly a bit long.

/**
 * Strips every attribute except `href` and `src` from the start tags in `markup`.
 *
 * Each attribute is tokenised as a whole (name plus optional quoted or unquoted
 * value) and kept only when its full name is href/src. This avoids keeping the
 * `href` inside `data-href`, or `src=...` text that merely appears inside another
 * attribute's quoted value.
 *
 * @param {string} markup - HTML source to clean.
 * @returns {string} A copy of `markup` with only href/src attributes left.
 */
function keepHrefAndSrc(markup) {
  // s1 = tag name, s2 = the raw attribute list, s3 = optional whitespace and "/" before ">".
  const startTagPattern = /<([a-zA-Z_][\w\-\.]*)((?:\s+[^\s"'<>\/=]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?)*)(\s*\/?\s*)>/g;
  // A single attribute with its leading whitespace; group 1 is the attribute name.
  const attributePattern = /\s+([^\s"'<>\/=]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'=<>`]+))?/g;
  return markup.replace(startTagPattern, (s, s1, s2, s3) => {
    const hrefSrc = s2.replace(attributePattern, (attr, name) => (/^(?:href|src)$/i.test(name) ? attr : ''));
    return '<' + s1 + hrefSrc + s3 + '>';
  });
}

const cleanedSample = keepHrefAndSrc('<div class="xxxx" href="xxxddx" >sdfsdf</div><span name="xxsdf" src="sdfsdf"></span>');
console.log(cleanedSample); // <div href="xxxddx" >sdfsdf</div><span src="sdfsdf"></span>
撸了一个,不知道对第七条用例算不算失败。增强版:处理了第七条用例中同时有 href 和 src 的情况;如果出现多个 href,我觉得就不应该在正则这个层面处理了。。
// Fixture strings for the attribute-stripping patterns; run() addresses them by array index.
const list = [
// 0: a single unquoted attribute that is neither href nor src
'<a test=adsf>asdf</a>asdfasdf<p>adf</p>',
// 1: unquoted href
'<a href=adsf>asdf</a>asdfasdf<p>adf</p>',
// 2: double-quoted href
'<a href="adsf">asdf</a>asdfasdf<p>adf</p>',
// 3: same string as entry 2
'<a href="adsf">asdf</a>asdfasdf<p>adf</p>',
// 4: another attribute before href
'<a aa="asdfads" href="adsf">asdf</a>asdfasdf<p>adf</p>',
// 5: another attribute after href
'<a href="adsf" aa="asdfads">asdf</a>asdfasdf<p>adf</p>',
// 6: href between two other attributes
'<a aa="asdfads" href="adsf" aa="asdfads">asdf</a>asdfasdf<p>adf</p>',
// 7: both href and src, interleaved with other attributes
'<a aa="asdfads" href="adsf" aa="asdfads" src="adsf" aa="asdfads">asdf</a>asdfasdf<p>adf</p>',
// 8: attributes separated by line breaks
`
<a aa="asdfads"
href="adsf"
aa="asdfads">asdf</a>asdfasdf<p>adf</p>
`,
// 9: attribute-like text inside a quoted value and again in the element's text content
`
<div value="src='AAA AAA'" src='BBBBBB' href=abcabc>
value="src='AAA AAA'" src='BBBBBB' href=abcabc
</div>`,
// 10: tags written with a trailing "/" before ">"
'<script src="adsf"/>asdf<link href="adsf" />asdf<p>adf</p>',
];
// Candidate patterns. Each entry pairs a global `search` regex with the `replace`
// template that run() passes to String.prototype.replace.
const regs = {
// First attempt: $1 is the tag opening ("<" plus name) and $2 is one href/src
// attribute including its leading whitespace; the value may be double-quoted,
// single-quoted or unquoted but cannot contain whitespace. The rest of the tag up
// to (not including) ">" is matched and dropped. A tag in which `\s+(href|src)=`
// never occurs does not match at all, so it keeps all of its attributes.
mine: {
search: /(<\w+)[^>]*(\s+\b(href|src)=("|'|)[^\s>]*\4)(?:\s)?[^>]*?(?=>)/g,
replace: '$1$2',
},
// Second attempt: the lookbehind starts the match right after the tag name, so the
// name is not part of the match. $2 and $6 are up to two optional href/src
// attributes; everything else up to the closing ">" or "/>" is matched and replaced
// by just those two captures.
enhanced: {
search: /(?<=<\w+\b)([^>]*?(\s+\b(href|src)=("|'|)[^\s>]*\4))?([^>]*?(\s+\b(href|src)=("|'|)[^\s>]*\8))?[^>]*?(?=\/?>)/g,
replace: '$2$6',
}
};
/**
 * Applies one search/replace pair to the fixtures in `list` and logs the outcome.
 *
 * @param {{search: RegExp, replace: string}} reg - Pattern and replacement template.
 * @param {number} [testIndex=-1] - Negative: process every fixture. Otherwise only
 *   the fixture at exactly this index is processed, and the raw `exec` match is
 *   logged before the replaced result.
 */
function run(reg, testIndex = -1) {
  const processAll = testIndex < 0;
  for (let position = 0; position < list.length; position += 1) {
    const source = list[position];
    const selected = processAll || testIndex === position;
    if (!selected) {
      continue;
    }
    console.log('▶', position, source);
    if (!processAll) {
      // Single-fixture mode also dumps the first raw match for inspection.
      const firstMatch = reg.search.exec(source);
      console.log(firstMatch);
    }
    const replaced = source.replace(reg.search, reg.replace);
    console.log('result:', replaced, '\n');
  }
}
// Run every fixture through the `enhanced` pattern (a negative index selects all of them).
run(regs.enhanced, -1);
//给我点差评
/**
 * Removes attributes other than href/src from every tag in `markup`.
 *
 * Works in two passes: `tagPattern` isolates each `<name ...>` / `<name ... />`
 * tag, then `attributePattern` deletes attributes inside that tag only, so text
 * between tags is never touched.
 *
 * Known limitations of the patterns (kept as originally written):
 * - the `(?!href|src)` lookahead is a prefix test, so attributes whose names merely
 *   start with href/src (e.g. `srcset`) are retained as well;
 * - attribute names are limited to letters and "-";
 * - nested quotes inside a quoted value are not handled.
 *
 * @param {string} markup - HTML source to clean.
 * @returns {string} A copy of `markup` with the other attributes removed.
 */
function stripNonLinkAttributes(markup) {
  // Whitespace + name, then a double-quoted, single-quoted, empty or bare value (or no value).
  const attributePattern = /(?:\s((?!href|src)[a-z-]+)(="[^"<>]+"|='[^'<>]+'|=""|=''|(=|)[A-z0-9]+))/igm;
  // One whole tag: "<", name, at least one more character, then ">" or "/>".
  const tagPattern = /(<[A-z]+[^<>]+(\/>|>))/igm;
  return markup.replace(tagPattern, (tag) => tag.replace(attributePattern, ''));
}

// Demo run. Every binding is declared locally: the earlier version assigned to
// undeclared names, which creates implicit globals in sloppy mode and throws a
// ReferenceError in strict mode / ES modules.
;( () => {
const str = `<a afa href="http://fanyi-pro.baidu.com/?hmsr=%E7%99%BE%E5%BA%A6%E7%BF%BB%E8%AF%91&hmpl=%E5%9B%BA%E5%AE%9A%E5%85%A5%E5%8F%A3&hmcu=%E9%A1%B6%E9%83%A8%E6%8C%89%E9%92%AE&hmkw=&hmci=" target="_blank" class="list-name" src="" f="" aefa>人工翻译</a>
<div id="search-box" class="search-box-new line">
<ul class="channel grid">
<li><a log="sc_pos:c_baidu" data-type='baidu' rel="nofollow" href="http://www.baidu.com/s?cl=3&wd=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD">网页</a></li>
<li><a log="sc_pos:c_news" data-type='news' rel="nofollow" href="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=zhidao">资讯</a></li>
<li><a log="sc_pos:c_video" data-type='video' rel="nofollow" href="https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&rsv_spt=16">视频</a></li>
<li><a log="sc_pos:c_pic" data-type='image' rel="nofollow" href="http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&t=3&ie=gbk">图片</a></li>
<li><strong>知道</strong></li>
<li><a log="sc_pos:c_doc" data-type='wenku' rel="nofollow" href="http://wenku.baidu.com/search?word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&lm=0&od=0">文库</a></li>
<li><a log="sc_pos:c_tieba" data-type='tieba' rel="nofollow" href="http://tieba.baidu.com/f?kw=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&t=4">贴吧</a></li><li><a log="sc_pos:c_b2b" data-type='b2b' rel="nofollow" href="https://b2b.baidu.com/s?q=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=www">采购</a></li>
<li><a log="sc_pos:c_map" data-type='map' rel="nofollow" href="http://map.baidu.com/m?word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=map007">地图</a></li><li><a log="sc_pos:c_more" data-type="more" href="http://www.baidu.com/more/">更多»</a></li>
</ul>
<div class="search-block clearfix">
<div class="search-cont clearfix">
<a class="logo" href="/" title="百度知道"></a>
<form action="/search" name="search-form" method="get" id="search-form-new" class="search-form">
<input class="hdi" id="kw" maxlength="256" tabindex="1" size="46" name="word" value="chrome书签本地文件在什么地方" autocomplete="off" placeholder="" />
<button alog-action="g-search-anwser" type="submit" id="search-btn" hidefocus="true" tabindex="2" class="btn-global">搜索答案</button>
<a href="#" alog-action="g-i-ask" class="i-ask-link" id="ask-btn-new">我要提问</a>
</form>
</div>
</div>
</div>
`
const re = stripNonLinkAttributes(str)
console.log(re)
})()
//结果正确
//<a href="http://fanyi-pro.baidu.com/?hmsr=%E7%99%BE%E5%BA%A6%E7%BF%BB%E8%AF%91&hmpl=%E5%9B%BA%E5%AE%9A%E5%85%A5%E5%8F%A3&hmcu=%E9%A1%B6%E9%83%A8%E6%8C%89%E9%92%AE&hmkw=&hmci=" src="">人工翻译</a>
<div>
<ul>
<li><a href="http://www.baidu.com/s?cl=3&wd=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD">网页</a></li>
<li><a href="https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=zhidao">资讯</a></li>
<li><a href="https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&rsv_spt=16">视频</a></li>
<li><a href="http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&t=3&ie=gbk">图片</a></li>
<li><strong>知道</strong></li>
<li><a href="http://wenku.baidu.com/search?word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&lm=0&od=0">文库</a></li>
<li><a href="http://tieba.baidu.com/f?kw=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&t=4">贴吧</a></li><li><a href="https://b2b.baidu.com/s?q=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=www">采购</a></li>
<li><a href="http://map.baidu.com/m?word=chrome%CA%E9%C7%A9%B1%BE%B5%D8%CE%C4%BC%FE%D4%DA%CA%B2%C3%B4%B5%D8%B7%BD&fr=map007">地图</a></li><li><a href="http://www.baidu.com/more/">更多»</a></li>
</ul>
<div>
<div>
<a href="/"></a>
<form>
<input />
<button >搜索答案</button>
<a href="#">我要提问</a>
</form>
</div>
</div>
</div>
我目前未解决的问题是单双引号嵌套如何确保引号正确
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<!-- <script js="" src="/respond.min.js"></script> -->
<!--[if lt IE 9]>
<script src="/html5shiv.min.js"></script>
<![endif]-->
<title>Document</title>
</head>
<body>
<textarea name="" id="test" cols="110" rows="110"></textarea>
<script type="text/javascript">
var a = "<div//>< hef='s' a><a/>";
var b = "<script ja><\/script><a/>";
var b = true || false;
</script>
<script>
// Rebuilds the page's own markup with quoted attributes other than src/href removed,
// then shows the result in the textarea.
var html = "";
// Number of <script> tokens seen that have not yet been closed by a </script token.
var breakDeep = 0;
// Tokenise the document's innerHTML. Each token is one of: a whole <...> tag, a single
// character that is not "<", "|", ">" or a word character, a run of word characters,
// or a run of whitespace / "|" / ">".
document.getElementsByTagName("html")[0].innerHTML.match(/<[^>]+>|[^<|>|\w]|\w+\b|[\s\|\>]+/mg).forEach(function(item, index){
// Script tags should be avoided where possible; this detection is not rigorous.
if(/<script/.test(item)){
// Opening script token: strip its attributes here and enter "inside script" mode.
html += item.replace(/\s(?!src=|href=)[\w-]*=('|").*?\1/g, '');
breakDeep++;
}else if(/<\/script/.test(item)){
breakDeep--;
}
if(breakDeep > 0){
// Inside a script: copy tokens through unchanged. The opening token was already
// appended above, so it is skipped here.
if(/<script/.test(item) == false){
html += item;
}
}else{
// Outside scripts: drop whitespace + name + quoted value unless the name is src or href.
// Unquoted and valueless attributes are not matched by this pattern.
html += item.replace(/\s(?!src=|href=)[\w-]*=('|").*?\1/g, '');
}
});
// NOTE(review): `test` is not declared in this script; presumably it resolves to the
// <textarea id="test"> through the browser's named-element globals — confirm.
test.value = html;
</script>
</body>
</html>
repost
正则写不来,但是好像可以直接js获取元素来实现,哈哈哈哈
脱离浏览器你怎么办
答案呢?
我觉得,首先正则很强大,但不要低估html的容错性(比如非闭合,嵌套错误),而且就算是格式化后的html标签,也有特例,比如
<textarea name="" id="" cols="30" rows="10">
<a value="src='AAA AAA'" src='BBBBBB' href=abcabc wtf=弄啥嘞>
value="src='AAA AAA'" src='BBBBBB' href=abcabc
</textarea>
就连语法高亮都认为textarea
里面是标签呢~
其次正则并不能匹配所有的邮箱,参考https://github.com/kdeldycke/awesome-falsehood#emails
期待答案
这题目简单的理解就是,写一个正则表达式,将字符串'正则'转化成'正则'。
当然,真正包含一个网页的html的字符串要比这个复杂。
而且,google里关于这个问题的前三篇文章答案,都存在严重的问题,随便写几个case都是满足不了的。
正则的问题,很多前端人员都停留在如何用正则去判断一个数字是不是手机号,一段字符串是不是邮箱,说实话,这都没用到正则知识体系的十分之一
在一些工程项目难题上,如果正则使用到位,真的是一行正则可以抵1000行代码。
建议有能力的小伙伴,可以玩一下这题。