I need to scrape <a>
tags in HTML.
My goal is to scrape the tags that have valid links inside their href attribute.
I think I'm very close to the answer, and this is the regex I wrote:
<a .*href=("|').*\.asp("|').*?>.*?<\/a>
第一个问题:
结果:
<a id='topnavbtn_tutorials' href='javascript:void(0);' onclick='w3_open_nav("tutorials")' title='Tutorials'>TUTORIALS <i class='fa fa-caret-down'></i><i class='fa fa-caret-up' style='display:none'></i></a><a id='topnavbtn_references' href='javascript:void(0);' onclick='w3_open_nav("references")' title='References'>REFERENCES <i class='fa fa-caret-down'></i><i class='fa fa-caret-up' style='display:none'></i></a><a id='topnavbtn_examples' href='javascript:void(0);' onclick='w3_open_nav("examples")' title='Examples'>EXAMPLES <i class='fa fa-caret-down'></i><i class='fa fa-caret-up' style='display:none'></i></a><a href='/forum/default.asp'>FORUM</a>
我只需要:
<a href='/forum/default.asp'>FORUM</a>
第二个问题:
结果:
<a href='/html/default.asp' class='w3-hide-small' title='HTML Tutorial'>HTML</a><a href='/css/default.asp' class='w3-hide-small' title='CSS Tutorial'>CSS</a><a href='/js/default.asp' class='w3-hide-small' title='JavaScript Tutorial'>JAVASCRIPT</a><a href='/sql/default.asp' class='w3-hide-small' title='SQL Tutorial'>SQL</a><a href='/php/default.asp' class='w3-hide-small' title='PHP Tutorial'>PHP</a><a href='/bootstrap/default.asp' class='w3-hide-small' title='Bootstrap Tutorial'>BOOTSTRAP</a><a href='/jquery/default.asp' class='w3-hide-small' title='jQuery Tutorial'>JQUERY</a><a href='/angular/default.asp' class='w3-hide-small' title='Angular Tutorial'>ANGULAR</a><a href='/xml/default.asp' class='w3-hide-small' title='XML Tutorial'>XML</a>
我需要它们作为单独的结果:
<a href='/html/default.asp' class='w3-hide-small' title='HTML Tutorial'>HTML</a>
<a href='/css/default.asp' class='w3-hide-small' title='CSS Tutorial'>CSS</a>
<a href='/js/default.asp' class='w3-hide-small' title='JavaScript Tutorial'>JAVASCRIPT</a>
依旧......
答案 0 :(得分:1)
<strong>更新。见下文。</strong>
如果你有字符串形式的HTML,你可以这样做:
// Split the HTML string so that each array element holds a single anchor tag.
// Nested anchor tags are illegal, so cutting at every "><a" boundary is feasible.
var anchorArray = str.replace(/><a/g, '>¶<a').split('¶'); // ¶ is a placeholder to split on
var matches = [];
// Deliberately NOT a global (`g`) regex: with `g`, test() remembers lastIndex
// between calls, so after one successful match the next element would be
// searched from the middle of the string and wrongly reported as a non-match.
var re = /<a .*href=["'].*\.asp["'].*?>.*?<\/a>/;
// Partition the pieces: anchors with a real ".asp" link go into `matches`
// (2nd condition); everything else is kept in `nonLinkParts`.
// filter() returns a new array, so its result has to be captured.
var nonLinkParts = anchorArray.filter(function (element) {
  if (re.test(element)) {
    matches.push(element);
    return false;
  }
  return true;
});
var returnedHTML = nonLinkParts.join(''); // HTML w/o actual links (1st condition)
请注意,解析HTML的首选方法不是使用正则表达式,而是使用HTML解析器。
答案 1 :(得分:0)
这会对你有所帮助
// Collect every simple anchor of the form <a href="...">word</a> found in input_content.
var matches = [];
// replace() is used only to iterate over the matches; its return value is ignored.
// The closing tag must be written as <\/a> and the word class as \w inside a
// regex literal; an unescaped "/" would terminate the literal early.
input_content.replace(/[^<]*(<a href="([^"]+)">\w*<\/a>)/g, function () {
  // Callback arguments are [fullMatch, anchorHtml, hrefValue, offset, inputString];
  // slice(1) drops only the full match.
  matches.push(Array.prototype.slice.call(arguments, 1));
});
它会把所有匹配结果以数组的形式保存到 matches 变量中!
答案 2 :(得分:-1)
public class MainActivity extends Activity {
TextView touchedXY, invertedXY, imgSize, colorRGB;
ImageView imgSource1;
Button b;
static final int CAMREQUEST = 1;
Uri fileUri;
@Override
public void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
touchedXY = (TextView) findViewById(R.id.xy);
invertedXY = (TextView) findViewById(R.id.invertedxy);
imgSize = (TextView) findViewById(R.id.size);
colorRGB = (TextView) findViewById(R.id.colorrgb);
b = (Button) findViewById(R.id.Button01);
b.setOnClickListener(new View.OnClickListener() {
@Override
public void onClick(View v) {
Intent cameraIntent = new Intent(android.provider.MediaStore.ACTION_IMAGE_CAPTURE);
fileUri = null;
fileUri = Uri.fromFile(new File(mediapath.getPath(), "Image" + System.currentTimeMillis() + ".jpg"));
intent.putExtra(MediaStore.EXTRA_OUTPUT, fileUri);
startActivityForResult(cameraIntent, CAMREQUEST);
}
});
}
protected void onActivityResult(int requestCode, int resultCode, Intent data) {
if (requestCode == CAMREQUEST) {
Bitmap bitmap = MediaStore.Images.Media.getBitmap(this.getContentResolver(), fileUri);
imgSource1 = (ImageView) findViewById(R.id.source1);
imgSource1.setImageBitmap(image);
imgSource1.setOnTouchListener(imgSourceOnTouchListener);
}
}
OnTouchListener imgSourceOnTouchListener
= new OnTouchListener() {
@Override
public boolean onTouch(View view, MotionEvent event) {
float eventX = event.getX();
float eventY = event.getY();
float[] eventXY = new float[]{eventX, eventY};
Matrix invertMatrix = new Matrix();
((ImageView) view).getImageMatrix().invert(invertMatrix);
invertMatrix.mapPoints(eventXY);
int x = Integer.valueOf((int) eventXY[0]);
int y = Integer.valueOf((int) eventXY[1]);
touchedXY.setText(
"touched position: "
+ String.valueOf(eventX) + " / "
+ String.valueOf(eventY));
invertedXY.setText(
"touched position: "
+ String.valueOf(x) + " / "
+ String.valueOf(y));
Drawable imgDrawable = ((ImageView) view).getDrawable();
Bitmap bitmap = ((BitmapDrawable) imgDrawable).getBitmap();
imgSize.setText(
"drawable size: "
+ String.valueOf(bitmap.getWidth()) + " / "
+ String.valueOf(bitmap.getHeight()));
//Limit x, y range within bitmap
if (x < 0) {
x = 0;
} else if (x > (bitmap.getWidth() - 1)) {
x = bitmap.getWidth() - 1;
}
if (y < 0) {
y = 0;
} else if (y > (bitmap.getHeight() - 1)) {
y = bitmap.getHeight() - 1;
}
int touchedRGB = bitmap.getPixel(x, y);
colorRGB.setText("touched color: " + "#" + Integer.toHexString(touchedRGB));
colorRGB.setTextColor(touchedRGB);
return true;
}
};
输出:
// Sample input: several anchor tags concatenated without any separator.
$string = "<a href='/html/default.asp' class='w3-hide-small' title='HTML Tutorial'>HTML</a><a href='/css/default.asp' class='w3-hide-small' title='CSS Tutorial'>CSS</a><a href='/js/default.asp' class='w3-hide-small' title='JavaScript Tutorial'>JAVASCRIPT</a><a href='/sql/default.asp' class='w3-hide-small' title='SQL Tutorial'>SQL</a><a href='/php/default.asp' class='w3-hide-small' title='PHP Tutorial'>PHP</a><a href='/bootstrap/default.asp' class='w3-hide-small' title='Bootstrap Tutorial'>BOOTSTRAP</a><a href='/jquery/default.asp' class='w3-hide-small' title='jQuery Tutorial'>JQUERY</a><a href='/angular/default.asp' class='w3-hide-small' title='Angular Tutorial'>ANGULAR</a><a href='/xml/default.asp' class='w3-hide-small' title='XML Tutorial'>XML</a>";
// The lazy quantifiers (.*?) stop at the first possible end, so each anchor
// becomes its own match instead of one match spanning the whole string.
preg_match_all('%<a href=\'/.*?\'>.*?</a>%s', $string, $matches, PREG_PATTERN_ORDER);
// With PREG_PATTERN_ORDER, $matches[0] holds the full-pattern matches.
foreach ($matches[0] as $anchor) {
    echo $anchor;
}
样本:
<strong>注意:</strong>