如何使用 Nutch 将 HBase 数据导出到 Elasticsearch?

时间:2015-07-30 13:08:16

标签: nutch

我按照 https://gist.github.com/xrstf/b48a970098a8e76943b9 的步骤整合了 Nutch 和 Elasticsearch。一切运行正常,数据已经存储在 HBase 的 'webpage'(网页)表中,但我在 Elasticsearch 中查不到这些数据。我想知道如何才能把数据导入 Elasticsearch 并在其中查询到。

1 个答案:

答案 0 :(得分:0)

下面是我的代码

package com.process;
/*
  import package will be here
*/

public class HbaseToElastic extends Configured implements
    org.apache.hadoop.util.Tool {

static class Mapper extends TableMapper<Text, IndexWritable> {

    // ---- Elasticsearch settings -------------------------------------------
    // All of these are static and are (re)assigned from the job Configuration
    // every time setup() runs; insertToElastic() reads the first five.

    // Value of conf key "cluster"; passed to Elasticsearch as "cluster.name".
    public static String CLUSTER;
    // Value of conf key "search_host".
    public static String SEARCH_HOST;
    // Value of conf key "search_port"; parsed with Integer.parseInt when used.
    public static String SEARCH_PORT;
    // Value of conf key "search_index_name".
    public static String SEARCH_INDEX_NAME;
    // Value of conf key "search_type".
    public static String SEARCHtYPE;
    // Value of conf key "search_bulk_size" (default 500). Not read anywhere in
    // the code shown here.
    public static int BULKSIZE;
    // Values of conf keys "table_name" and "family". Not read anywhere in the
    // code shown here (map() hard-codes the family "cf").
    public static String TABLENAME;
    public static String FAMILY;

    // ---- Category keyword lists --------------------------------------------
    // Filled in setup() from comma-separated conf values and consulted by
    // setCorrectCategory() to decide whether a description supports its tag.
    private static List<String> SPORTS_KEYWORDS;
    private static List<String> BUSINESS_KEYWORDS;
    private static List<String> GOSSIP_KEYWORDS;
    private static List<String> CRIME_KEYWORDS;

    // Parsed in setup() from the JSON in conf key "statemap". assignCity()
    // looks up the inner map by state name, matches its values against the
    // description and appends the matching key to the tag.
    private static Map<String, Map<String, String>> STATE_MAP = new HashMap<String, Map<String, String>>();
    // Overwritten by assignCity() with the inner map of the state being processed.
    private static Map<String, String> CITY_MAP = new HashMap<String, String>();

    // Shared instance that setLang() uses to reach the instance method detect().
    private static Mapper mapper = new Mapper();

    // Loads the language-detection profiles once per class load from the
    // relative directory "./profiles". The "doneN" prints are progress
    // markers; a LangDetectException is printed and otherwise ignored.
    static {
        try {
            System.out.println("done1");
            DetectorFactory.loadProfile("./profiles");
            System.out.println("done2");
        } catch (final LangDetectException e) {
            System.out.println("done3");
            e.printStackTrace();
        }

    }

    // HBase configuration and table handle; created in setup(), the table is
    // closed in cleanup().
    Configuration hbaseConf = null;
    HTable table = null;

    // Puts that mark rejected rows with ErrorFlag=1; queued by map() and
    // written to the table in cleanup().
    List<Put> hbasePutErrorList = new ArrayList<Put>();

    /**
     * Flushes the queued "ErrorFlag" puts to HBase and releases the table.
     *
     * <p>The table is closed and the queue cleared even when the flush fails,
     * and a missing table (for example because {@code setup} failed before
     * opening it) is tolerated instead of raising a NullPointerException.
     *
     * @param context the task context
     * @throws IOException if writing the queued puts or closing the table fails
     * @throws InterruptedException propagated from the superclass cleanup
     */
    @Override
    protected void cleanup(final Context context) throws IOException,
            InterruptedException {
        super.cleanup(context);

        if (table == null) {
            // Nothing was opened, so there is nowhere to write the error flags.
            hbasePutErrorList.clear();
            return;
        }
        try {
            table.put(hbasePutErrorList);
        } finally {
            // Always release the HBase connection, even if the flush threw.
            table.close();
            hbasePutErrorList.clear();
        }
    }

    /**
     * Reads the job configuration and prepares the mapper: Elasticsearch
     * settings, the HBase table handle, the per-category keyword lists and
     * the state/city lookup map.
     *
     * <p>Optional settings that are absent no longer abort the task: unset
     * HBase properties keep the defaults from {@code HBaseConfiguration},
     * unset keyword lists become empty lists, and an unset "statemap" becomes
     * an empty map.
     *
     * @param context the task context supplying the job configuration
     * @throws IOException if the HBase table cannot be opened
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(
            final org.apache.hadoop.mapreduce.Mapper<ImmutableBytesWritable, Result, Text, IndexWritable>.Context context)
            throws IOException, InterruptedException {
        final Configuration conf = context.getConfiguration();

        CLUSTER = conf.get("cluster");
        SEARCH_HOST = conf.get("search_host");
        SEARCH_PORT = conf.get("search_port");
        SEARCH_INDEX_NAME = conf.get("search_index_name");
        SEARCHtYPE = conf.get("search_type");
        BULKSIZE = conf.getInt("search_bulk_size", 500);
        TABLENAME = conf.get("table_name");
        FAMILY = conf.get("family");

        hbaseConf = HBaseConfiguration.create();
        copyIfPresent(conf, hbaseConf, "hbase.zookeeper.quorum");
        copyIfPresent(conf, hbaseConf, "hbase.zookeeper.property.clientPort");
        copyIfPresent(conf, hbaseConf, "hbase.rpc.timeout");
        copyIfPresent(conf, hbaseConf, "hbase.regionserver.lease.period");
        copyIfPresent(conf, hbaseConf, "hbase.master");
        table = new HTable(hbaseConf, conf.get("table_name"));

        SPORTS_KEYWORDS = splitKeywords(conf.get("sportskeywords"));
        BUSINESS_KEYWORDS = splitKeywords(conf.get("businesskeywords"));
        GOSSIP_KEYWORDS = splitKeywords(conf.get("gossipkeywords"));
        CRIME_KEYWORDS = splitKeywords(conf.get("crimekeywords"));

        // Gson returns null for a null/empty JSON string; fall back to an
        // empty map so later lookups do not dereference null.
        @SuppressWarnings("unchecked")
        final Map<String, Map<String, String>> parsedStates = new Gson()
                .fromJson(conf.get("statemap"), Map.class);
        STATE_MAP = parsedStates != null ? parsedStates
                : new HashMap<String, Map<String, String>>();
    }

    /**
     * Copies one property from the job configuration to the HBase
     * configuration, leaving the target untouched when the property is unset.
     */
    private static void copyIfPresent(final Configuration from,
            final Configuration to, final String key) {
        final String value = from.get(key);
        if (value != null) {
            to.set(key, value);
        }
    }

    /**
     * Splits a comma-separated keyword list into trimmed, non-empty entries.
     * Blank entries are dropped because an empty keyword would match every
     * description via {@code String.contains("")}.
     */
    private static List<String> splitKeywords(final String csv) {
        final List<String> keywords = new ArrayList<String>();
        if (csv == null) {
            return keywords;
        }
        for (final String raw : csv.split(",")) {
            final String keyword = raw.trim();
            if (keyword.length() > 0) {
                keywords.add(keyword);
            }
        }
        return keywords;
    }

    /**
     * Turns one HBase row into a JSON document for indexing.
     *
     * <p>Every cell of the row becomes a {@code qualifier -> value} entry. The
     * entry map is then run through the validation/enrichment steps; if all of
     * them accept it, the JSON is emitted under the row key. Otherwise a put
     * setting {@code cf:ErrorFlag = "1"} is queued for {@code cleanup}.
     *
     * <p>Cell bytes are decoded with {@code Bytes.toString}, i.e. as UTF-8,
     * instead of the platform default charset, so non-ASCII text does not
     * depend on the JVM's locale settings.
     *
     * @param row the row key
     * @param result the cells of the row
     * @param context the task context used to emit the document
     * @throws IOException declared by the overridden method
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(final ImmutableBytesWritable row, final Result result,
            final Context context) throws IOException, InterruptedException {
        try {
            final String keyString = Bytes.toString(row.get());
            final Map<String, Object> mapobject = new HashMap<String, Object>();
            for (final KeyValue kv : result.raw()) {
                mapobject.put(Bytes.toString(kv.getQualifier()),
                        Bytes.toString(kv.getValue()));
            }

            if (prepareDocument(mapobject)) {
                final byte b = 0;
                final String json = new Gson().toJson(mapobject);
                context.write(new Text(keyString), new IndexWritable(json, b));
            } else {
                final Put put = new Put(Bytes.toBytes(keyString));
                put.add(Bytes.toBytes("cf"), Bytes.toBytes("ErrorFlag"),
                        Bytes.toBytes("1"));
                hbasePutErrorList.add(put);
            }
        } catch (final InterruptedException e) {
            // Interruption is a request to stop the task; do not swallow it.
            throw e;
        } catch (final Exception e) {
            // Best effort: one bad row must not fail the whole task.
            e.printStackTrace();
        }
    }

    /**
     * Runs the validation and enrichment steps in their required order and
     * reports whether the document should be indexed. The steps mutate
     * {@code mapobject} in place.
     */
    private boolean prepareDocument(final Map<String, Object> mapobject) {
        if (!checkValidType(mapobject)) {
            return false;
        }
        refineMetaTags(mapobject);
        if (!refineDescription(mapobject)) {
            return false;
        }
        assignCity(mapobject);
        if (!checkTitleImage(mapobject) || !setLang(mapobject)) {
            return false;
        }
        setCorrectCategory(mapobject);
        correctDuplicateTitle(mapobject);
        return true;
    }

    /**
     * Removes a repeated opening phrase from the title.
     *
     * <p>If the first three space-separated words of the title occur again in
     * the remainder of the title, the title is replaced by the remainder
     * starting at that repetition. Titles with fewer than four words, or with
     * no repetition, are left unchanged, as is a document without a title.
     *
     * @param mapobject the document fields; "title" may be rewritten
     */
    private void correctDuplicateTitle(final Map<String, Object> mapobject) {
        final Object rawTitle = mapobject.get("title");
        if (rawTitle == null) {
            return;
        }
        final String[] words = rawTitle.toString().split(" ", 4);
        if (words.length < 4) {
            return;
        }
        final String leadingPhrase = words[0] + " " + words[1] + " "
                + words[2];
        final String remainder = words[3];
        final int repeatAt = remainder.indexOf(leadingPhrase);
        if (repeatAt >= 0) {
            mapobject.put("title", remainder.substring(repeatAt));
        }
    }

    /**
     * Corrects the "tags" category using hints in the URL and keyword hits in
     * the description.
     *
     * <ul>
     * <li><b>sports / cricket</b>: kept when the URL contains a sports or
     * cricket hint. Otherwise the tag becomes "national" unless at least two
     * sports keywords occur in the description; with two or more hits it
     * becomes "cricket" when the description contains a cricket term, and is
     * left unchanged when it does not.</li>
     * <li><b>business / gossip / crime</b>: a sports hint in the URL retags to
     * "sports", a cricket hint to "cricket", and (for gossip and crime) a
     * "busines" hint to "business". Otherwise, when the URL has no hint for
     * the current category and fewer than two of that category's keywords
     * occur in the description, the tag becomes "national".</li>
     * <li>Any other category is left unchanged.</li>
     * </ul>
     *
     * @param mapobject the document fields; reads "url", "tags" and
     *            "description", may rewrite "tags"
     */
    private void setCorrectCategory(final Map<String, Object> mapobject) {
        final String lowerUrl = (mapobject.get("url") + "").toLowerCase();
        final String cat = mapobject.get("tags") + "";

        if ("sports".equalsIgnoreCase(cat) || "cricket".equalsIgnoreCase(cat)) {
            if (containsAny(lowerUrl, "sport", "खेल", "cric", "क्रिकेट")) {
                return;
            }
            final String desc = mapobject.get("description").toString();
            if (countKeywordHits(desc, SPORTS_KEYWORDS) < 2) {
                mapobject.put("tags", "national");
            } else if (containsAny(desc, "क्रिकेट", "टॉस", "वनडे", "बल्लेबाज")) {
                // The URL cannot contain "cric" on this path, so only the
                // description terms can promote the tag to cricket.
                mapobject.put("tags", "cricket");
            }
            return;
        }

        final boolean business = "business".equalsIgnoreCase(cat);
        final boolean gossip = "gossip".equalsIgnoreCase(cat);
        final boolean crime = "crime".equalsIgnoreCase(cat);
        if (!business && !gossip && !crime) {
            return;
        }

        if (containsAny(lowerUrl, "sport", "खेल")) {
            mapobject.put("tags", "sports");
            return;
        }
        if (containsAny(lowerUrl, "cric", "क्रिकेट")) {
            mapobject.put("tags", "cricket");
            return;
        }
        if (!business && lowerUrl.contains("busines")) {
            mapobject.put("tags", "business");
            return;
        }

        final boolean urlConfirmsCategory;
        final List<String> keywords;
        if (business) {
            urlConfirmsCategory = containsAny(lowerUrl, "busines", "व्यापार",
                    "economy", "finance", "बिजनेस", "market", "karobar",
                    "कारोबार");
            keywords = BUSINESS_KEYWORDS;
        } else if (gossip) {
            urlConfirmsCategory = containsAny(lowerUrl, "masala", "gossip",
                    "gupshup", "garam");
            keywords = GOSSIP_KEYWORDS;
        } else {
            urlConfirmsCategory = containsAny(lowerUrl, "crime", "terrorist",
                    "abuse", "forgery", "assault", "violence", "rape",
                    "teasing", "molestation", "scandal", "murder");
            keywords = CRIME_KEYWORDS;
        }

        if (!urlConfirmsCategory) {
            final String desc = mapobject.get("description").toString();
            if (countKeywordHits(desc, keywords) < 2) {
                mapobject.put("tags", "national");
            }
        }
    }

    /** Returns true when {@code text} contains at least one of the needles. */
    private static boolean containsAny(final String text,
            final String... needles) {
        for (final String needle : needles) {
            if (text.contains(needle)) {
                return true;
            }
        }
        return false;
    }

    /** Counts how many of the given keywords occur in {@code text}. */
    private static int countKeywordHits(final String text,
            final List<String> keywords) {
        int hits = 0;
        for (final String keyword : keywords) {
            if (text.contains(keyword)) {
                hits++;
            }
        }
        return hits;
    }

    /**
     * Decides whether the document's content type is one that should be
     * indexed.
     *
     * @param mapobject the document fields; reads "type"
     * @return {@code true} when a "type" entry exists and its text contains
     *         neither "image" nor "rss"; {@code false} otherwise
     */
    private boolean checkValidType(final Map<String, Object> mapobject) {
        if (!mapobject.containsKey("type")) {
            return false;
        }
        final String contentType = mapobject.get("type").toString();
        final boolean excluded = contentType.contains("image")
                || contentType.contains("rss");
        return !excluded;
    }

    /**
     * Ensures the document has a usable description.
     *
     * <p>The existing "description" is accepted when it is longer than 75
     * characters and contains none of the markers {@code ;}} , {@code <cite>},
     * {@code href=} or "All rights reserved". Failing that,
     * "metatag.description" is copied into "description" when it is longer
     * than 75 characters and contains neither {@code ;}} nor {@code <cite>}.
     *
     * @param mapobject the document fields; "description" may be replaced
     * @return {@code true} when a usable description is in place,
     *         {@code false} when neither candidate qualifies
     */
    private boolean refineDescription(final Map<String, Object> mapobject) {
        if (mapobject.containsKey("description")) {
            final String body = mapobject.get("description").toString();
            final boolean clean = !body.contains(";}")
                    && !body.contains("<cite>")
                    && !body.contains("href=")
                    && !body.contains("All rights reserved");
            if (body.length() > 75 && clean) {
                return true;
            }
        }

        if (mapobject.containsKey("metatag.description")) {
            final Object metaValue = mapobject.get("metatag.description");
            final String metaText = metaValue.toString();
            if (metaText.length() > 75 && !metaText.contains(";}")
                    && !metaText.contains("<cite>")) {
                mapobject.put("description", metaValue);
                return true;
            }
        }

        return false;
    }

    /**
     * Rebuilds the "metatag.keywords" field as a short comma-separated list
     * and records in "TagFlag" (1 or 0) whether any keyword was found.
     *
     * <p>Three sources are tried in order, each only if the previous one
     * produced nothing:
     * <ol>
     * <li>the existing "metatag.keywords" value, split on "," and "|";
     * phrases mentioning the site's host name are skipped, words are filtered
     * by {@link #removeUnicodeWords(String[])}, and only phrases with 1-4
     * remaining words are kept while the total stays under 70 characters;</li>
     * <li>words of the "title" longer than four characters that do not start
     * with a Devanagari character (at most five, and only used when more than
     * three were collected);</li>
     * <li>tokens of the last path segment of "url" produced by
     * {@link #generateTokens(String)}, with the same length and script filter
     * (at most five).</li>
     * </ol>
     * Finally a handful of punctuation characters are stripped from the result.
     *
     * @param mapobject the document fields; reads "metatag.keywords", "host",
     *            "title" and "url", writes "TagFlag" and "metatag.keywords"
     */
    private void refineMetaTags(final Map<String, Object> mapobject) {
        String metaTag = "";
        int tagFlag = 0;
        // Source 1: the page's own keyword meta tag.
        if (mapobject.containsKey("metatag.keywords")) {

            final String metaTags[] = mapobject.get("metatag.keywords")
                    .toString().replaceAll("\\|", ",").split(",");
            String domain = null;
            StringBuilder temp = null;
            for (final String metaTag2 : metaTags) {
                // Skip phrases that mention the site itself: either the
                // second-to-last or the first dot-separated label of the host.
                if (mapobject.containsKey("host")) {
                    domain = mapobject.get("host") + "";
                    if (domain.split("\\.").length > 1
                            && (metaTag2
                                    .contains(domain.split("\\.")[domain
                                            .split("\\.").length - 2]) || metaTag2
                                    .contains(domain.split("\\.")[0])))

                    {
                        continue;
                    }
                }
                String[] arr = metaTag2.split(" ");
                arr = removeUnicodeWords(arr);
                // Keep only phrases of one to four surviving words.
                if (arr.length > 0 && arr.length < 5) {
                    temp = new StringBuilder();
                    for (final String str : arr) {

                        temp.append(str);
                        temp.append(" ");
                    }
                    // Cap the accumulated keyword string below 70 characters.
                    if (metaTag.length() + temp.length() < 70) {
                        metaTag = metaTag + "," + temp.toString();
                    }
                }

            }
            // Every accepted phrase was prefixed with ","; drop the first one.
            if (metaTag.startsWith(",")) {
                metaTag = metaTag.trim();
                metaTag = metaTag.substring(1, metaTag.length());
            }
        }

        // Source 2: words of the title.
        if (metaTag.length() < 1 && mapobject.containsKey("title")) {
            /**
             * Extracting tags from the title tag if the length of the
             * keyword is greater than 4
             */
            final String title = (String) mapobject.get("title");
            final String splitTitle[] = title.split(" ");
            int count = 0;
            for (int i = 0; i < splitTitle.length; i++) {
                // Words starting with a Devanagari character are excluded.
                if (splitTitle[i].length() > 4
                        && !splitTitle[i].matches("^[\\u0900-\\u097F].*")) {
                    metaTag = metaTag + splitTitle[i] + ",";
                    count++;
                    if (count == 5) {
                        break;
                    }
                }
            }
            // Use the title words only if more than three were collected;
            // otherwise discard them and fall through to the URL.
            if (metaTag.split(",").length > 3) {
                if (metaTag.endsWith(",")) {
                    metaTag = metaTag.trim();
                    metaTag = metaTag.substring(0, metaTag.length() - 1);
                }
            } else {
                metaTag = "";
            }
        }
        // Source 3: tokens of the last URL path segment.
        if (metaTag.length() < 1) {
            /**
             * Extracting the tags from the url if the length of the keyword
             * is greater than 4
             */
            final String splitUrl[] = mapobject.get("url").toString()
                    .split("/");
            final String lastSplitValue = splitUrl[splitUrl.length - 1];

            // generateTokens returns null when it finds three or fewer tokens.
            final String tagList[] = generateTokens(lastSplitValue);
            if (tagList != null) {
                int count = 0;
                for (int i = 0; i < tagList.length; i++) {
                    if (tagList[i].length() > 4
                            && !tagList[i].matches("^[\\u0900-\\u097F].*")) {
                        metaTag = metaTag + tagList[i] + ",";
                        count++;
                        if (count == 5) {
                            break;
                        }
                    }
                }
            }
            if (metaTag.endsWith(",")) {
                metaTag = metaTag.trim();
                metaTag = metaTag.substring(0, metaTag.length() - 1);
            }
        }
        // Strip the characters [ " ; : ' = & from whatever was collected and
        // flag that keywords are available.
        if (metaTag.length() > 0) {
            metaTag = metaTag.replaceAll("\\[", "");
            metaTag = metaTag.replaceAll("\"", "");
            metaTag = metaTag.replaceAll(";", "");
            metaTag = metaTag.replaceAll(":", "");
            metaTag = metaTag.replaceAll("\u0027", "");
            metaTag = metaTag.replaceAll("\u003d", "");
            metaTag = metaTag.replaceAll("\u0026", "");
            tagFlag = 1;
        }
        mapobject.put("TagFlag", tagFlag);
        mapobject.put("metatag.keywords", metaTag);
    }

    /**
     * Filters a list of words down to those usable as keywords.
     *
     * <p>A word is dropped when it is {@code null}, when its trimmed length is
     * three characters or fewer, when it starts with a Devanagari character,
     * or when it starts with a digit. Surviving words are returned trimmed,
     * in their original order.
     *
     * @param arr the candidate words
     * @return the accepted words, trimmed; never {@code null}
     */
    private String[] removeUnicodeWords(final String[] arr) {
        final List<String> kept = new ArrayList<String>();
        for (int i = 0; i < arr.length; i++) {
            final String word = arr[i];
            if (word == null) {
                continue;
            }
            final String trimmed = word.trim();
            if (trimmed.length() <= 3) {
                continue;
            }
            if (word.matches("^[\\u0900-\\u097F].*")
                    || word.matches("^[0-9].*")) {
                continue;
            }
            kept.add(trimmed);
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * Tokenizes a URL path segment with the Lucene {@code HindiAnalyzer} and
     * returns the tokens that could serve as keywords.
     *
     * <p>Dots and "%20" sequences are replaced by spaces before analysis, so
     * a file extension is separated from the slug. (Previously the code
     * called {@code replace("\\.", " ")}, which looks for a literal
     * backslash followed by a dot and therefore never replaced anything.)
     * Tokens of four characters or fewer, or starting with a digit, are
     * discarded.
     *
     * <p>Analysis errors are printed and the tokens collected so far are
     * used; the token stream and analyzer are closed in every case.
     *
     * @param lastSplitValue the last path segment of the URL
     * @return the accepted tokens, or {@code null} when three or fewer were
     *         found
     */
    private String[] generateTokens(String lastSplitValue) {
        final List<String> list = new ArrayList<String>();
        lastSplitValue = lastSplitValue.replace(".", " ").replace("%20",
                " ");
        try {
            final Version matchVersion = Version.LUCENE_45;
            final Analyzer analyzer = new HindiAnalyzer(matchVersion);
            try {
                final TokenStream ts = analyzer.tokenStream("field",
                        new StringReader(lastSplitValue));
                try {
                    final CharTermAttribute cta = ts
                            .getAttribute(CharTermAttribute.class);
                    ts.reset();
                    while (ts.incrementToken()) {
                        final String term = cta.toString();
                        if (term.length() > 4 && !term.matches("^[0-9].*")) {
                            list.add(term);
                        }
                    }
                    ts.end();
                } finally {
                    ts.close();
                }
            } finally {
                analyzer.close();
            }
        } catch (final Exception e) {
            e.printStackTrace();
        }
        if (list.size() > 3) {
            return list.toArray(new String[list.size()]);
        } else {
            return null;
        }
    }

    /**
     * Detects the language of the document and stores it in "lang".
     *
     * <p>The language is detected from the title. The description is also
     * run through the detector, but only as a sanity check: its result is
     * not used, and a detection failure on either text rejects the document.
     * (Previously the "description" variable was read from the "title" key,
     * so the description itself was never checked.) If no description is
     * present the title is checked twice, as before.
     *
     * @param mapobject the document fields; reads "title" and "description",
     *            writes "lang" on success
     * @return {@code true} when the detected language is "hi", "lt" or starts
     *         with "en"; {@code false} when detection fails or yields any
     *         other language
     */
    private boolean setLang(final Map<String, Object> mapobject) {
        final String title = mapobject.get("title").toString();
        final Object rawDescription = mapobject.get("description");
        final String description = rawDescription != null ? rawDescription
                .toString() : title;
        String language = "";
        try {
            language = mapper.detect(title);
            mapper.detect(description);
        } catch (final LangDetectException e) {
            System.out.println("\n title with error is - " + title);
            System.out.println("\n description with error is - "
                    + description);
            e.printStackTrace();
            return false;
        }

        final String code = language.trim();
        if (code.equalsIgnoreCase("hi") || code.startsWith("en")
                || code.equalsIgnoreCase("lt")) {
            mapobject.put("lang", language);
            return true;
        }

        return false;
    }

    /**
     * Runs language detection on a piece of text.
     *
     * @param text the text to classify
     * @return the language code reported by the detector
     * @throws LangDetectException if the detector cannot be created or cannot
     *             classify the text
     */
    private String detect(final String text) throws LangDetectException {
        final Detector languageDetector = DetectorFactory.create();
        languageDetector.append(text);
        final String languageCode = languageDetector.detect();
        return languageCode;
    }

    /**
     * Chooses the document title (and, through
     * {@link #assignTitleImage(Map, TreeSet, JsonObject)}, an image) from the
     * "anchor" data, then decides whether the document has an acceptable
     * title.
     *
     * <p>"anchor" is expected to hold JSON: normally an array whose elements
     * are strings that are themselves JSON objects with "title", "href" and
     * "alt"; a plain string element is treated as a bare title. Candidate
     * titles are collected in a set ordered by {@code SetSort} (defined
     * outside this listing - presumably it ranks candidates by preference;
     * confirm). Of the first two candidates, one starting with a Devanagari
     * character is preferred.
     *
     * @param mapobject the document fields; reads "anchor" and "title", may
     *            rewrite "title" and the image fields
     * @return {@code true} when the final title is non-empty, has between 3
     *         and 19 space-separated words and contains no "&lt;";
     *         {@code false} otherwise
     */
    private boolean checkTitleImage(final Map<String, Object> mapobject) {
        final TreeSet<String> set = new TreeSet<String>(new SetSort());

        final Gson gson = new Gson();
        JsonArray array = null;
        JsonObject object2 = null;

        if (mapobject.containsKey("anchor")
                && mapobject.get("anchor") != null) {
            final String arr = (String) mapobject.get("anchor");

            try {

                array = gson.fromJson(arr, JsonArray.class);

                for (final JsonElement jsonElement : array) {

                    try {
                        // Each element is a string holding a JSON object.
                        object2 = gson.fromJson(jsonElement.getAsString(),
                                JsonObject.class);
                    } catch (final Exception e) {

                        // Not JSON: treat the element text as a bare title
                        // with no link or alt text.
                        if (object2 == null) {
                            object2 = new JsonObject();
                            object2.addProperty("title",
                                    jsonElement.getAsString());
                            object2.addProperty("href", "");
                            object2.addProperty("alt", "");
                        }
                    }
                    if (object2 != null) {
                        assignTitleImage(mapobject, set, object2);
                    }
                    // Reset so the fallback above triggers for the next element.
                    object2 = null;
                }
            } catch (final ClassCastException e) {

                // NOTE(review): this branch assumes Gson signals "not an
                // array" with a ClassCastException; whether it does depends
                // on the Gson version - confirm. The anchor is then parsed as
                // a single JSON object.
                object2 = gson.fromJson(arr, JsonObject.class);
                assignTitleImage(mapobject, set, object2);

            } catch (final Exception e) {
                e.printStackTrace();
            }

            if (!set.isEmpty()) {
                // Look at the first two candidates in the set's order.
                int loop = 0;
                final List<String> tempList = new LinkedList<String>();
                for (final String string : set) {
                    final String title = string;
                    tempList.add(title.trim());
                    loop++;
                    if (loop == 2) {
                        break;
                    }
                }
                if (!tempList.isEmpty()) {
                    // Prefer a candidate starting with a Devanagari
                    // character; otherwise fall back to the first candidate.
                    if (tempList.get(0).matches("^[\\u0900-\\u097F].*")) {
                        mapobject.put("title", tempList.get(0));
                    } else if (tempList.size() > 1
                            && !(tempList.get(0)
                                    .matches("^[\\u0900-\\u097F].*"))
                            && tempList.get(1).matches(
                                    "^[\\u0900-\\u097F].*")) {
                        mapobject.put("title", tempList.get(1));
                    } else {
                        mapobject.put("title", tempList.get(0));
                    }
                }
            }

        }
        // Acceptance test on whatever title is now in place.
        if (mapobject.containsKey("title")
                && mapobject.get("title").toString().length() > 0
                && mapobject.get("title").toString().split(" ").length > 2
                && mapobject.get("title").toString().split(" ").length < 20
                && !mapobject.get("title").toString().contains("<")) {
            // Anchor candidates were already refined in assignTitleImage;
            // the page's own title is refined only when none were found.
            if (set.isEmpty()) {
                mapobject.put("title",
                        getTitleRefined(mapobject.get("title") + ""));
            }
            return true;
        }
        return false;
    }

    /**
     * Processes one anchor entry: picks up its image link if the document has
     * no image yet, and records its title as a candidate title.
     *
     * <p>The "href" is used as an image when it is non-empty and contains
     * ".jpg", ".jpeg" or ".gif" (case-insensitive). The "title" becomes a
     * candidate when it is non-empty, has between 3 and 19 space-separated
     * words and contains no "&lt;"; it is refined with
     * {@link #getTitleRefined(String)} before being added.
     *
     * @param mapobject the document fields; reads "ImgH1", "ImgH2" and "tags",
     *            image fields are written via {@code putImages}
     * @param set collector for candidate titles
     * @param object2 the anchor entry with optional "href" and "title"
     */
    private void assignTitleImage(final Map<String, Object> mapobject,
            final TreeSet<String> set, final JsonObject object2) {
        final boolean imageAlreadyChosen = mapobject.containsKey("ImgH1")
                || mapobject.containsKey("ImgH2");
        if (!imageAlreadyChosen) {
            final JsonElement hrefElement = object2.get("href");
            if (hrefElement != null) {
                final String href = hrefElement.getAsString();
                if (href.length() > 0) {
                    final String hrefLower = href.toLowerCase();
                    final boolean looksLikeImage = hrefLower.contains(".jpg")
                            || hrefLower.contains(".jpeg")
                            || hrefLower.contains(".gif");
                    if (looksLikeImage) {
                        final String imageUrl = href.trim();
                        final String category = mapobject.get("tags")
                                .toString().trim().toLowerCase();
                        putImages(mapobject, imageUrl, category);
                    }
                }
            }
        }

        final JsonElement titleElement = object2.get("title");
        if (titleElement == null) {
            return;
        }
        final String anchorTitle = titleElement.getAsString();
        if (anchorTitle.length() == 0 || anchorTitle.contains("<")) {
            return;
        }
        final int wordCount = anchorTitle.split(" ").length;
        if (wordCount > 2 && wordCount < 20) {
            set.add(getTitleRefined(anchorTitle).trim());
        }
    }

    /**
     * Strips boilerplate from a title.
     *
     * <p>The characters {@code '}, {@code &} and {@code =} are removed. Then:
     * <ul>
     * <li>if the title contains "-" and the part after the first "-" does not
     * start with a Devanagari character, only the part before the first "-"
     * is returned (trimmed);</li>
     * <li>otherwise, if the title contains no "-" but contains ":" and the
     * part before the first ":" does not start with a Devanagari character,
     * the part after the first ":" is returned (trimmed);</li>
     * <li>in every other case the title is returned with only the characters
     * removed.</li>
     * </ul>
     * The number of parts is checked before any part is accessed, so a title
     * consisting only of separators no longer raises
     * {@code ArrayIndexOutOfBoundsException}.
     *
     * @param title the raw title
     * @return the refined title
     */
    private String getTitleRefined(String title) {
        title = title.replaceAll("\u0027", "");
        title = title.replaceAll("\u0026", "");
        title = title.replaceAll("\u003d", "");
        if (title.contains("-")) {
            final String[] parts = title.trim().split("-");
            if (parts.length > 1
                    && !parts[1].trim().matches("^[\\u0900-\\u097F].*")) {
                return parts[0].trim();
            }
        } else if (title.contains(":")) {
            final String[] parts = title.trim().split(":");
            if (parts.length > 1
                    && !parts[0].trim().matches("^[\\u0900-\\u097F].*")) {
                return parts[1].trim();
            }
        }
        return title;
    }

    /**
     * Records the source image URL and derives the storage paths for it.
     *
     * <p>"ImgSrc" is set to the HTML-unescaped URL followed by the separator
     * {@code ##RAFTAAR##} and the original URL. "ImgH1" and "ImgH2" are set
     * to {@code h1|h2/<category>/<first char of file name>/<file name>},
     * where the file name is the last path segment with spaces and "%20"
     * turned into "_", cut at the first ".jpg" and given a ".jpg" suffix.
     *
     * <p>The cut now looks for the literal text ".jpg"; previously the string
     * was used as a regular expression, where "." matches any character, so a
     * name such as "my_jpg_pic.jpg" was truncated to "my.jpg".
     *
     * <p>If the URL is blank only "ImgSrc" is written, and if it has no file
     * name the path fields are left unset. Unexpected errors are printed and
     * otherwise ignored.
     *
     * @param map2 the document fields; "ImgSrc", "ImgH1" and "ImgH2" are
     *            written
     * @param imageUrl the image URL taken from the anchor
     * @param category the document category used as the directory name
     */
    private void putImages(final Map<String, Object> map2,
            final String imageUrl, final String category) {
        try {
            final String unescaped = StringEscapeUtils.unescapeHtml(imageUrl)
                    .trim();
            map2.put("ImgSrc", unescaped);
            if (unescaped.length() == 0) {
                return;
            }
            // The unescaped value is unescaped a second time here, exactly as
            // the original code did, to keep the stored value unchanged.
            map2.put("ImgSrc", StringEscapeUtils.unescapeHtml(unescaped)
                    + "##RAFTAAR##" + imageUrl.trim());

            final String[] segments = imageUrl.split("/");
            if (segments.length == 0) {
                return;
            }
            String imgName = segments[segments.length - 1];
            if (imgName.length() == 0) {
                return;
            }
            final String imagePath = "/" + String.valueOf(imgName.charAt(0));
            imgName = imgName.replaceAll(" ", "_").replaceAll("%20", "_");

            // Drop anything from the first literal ".jpg" onwards (for
            // example a query string) and normalise the suffix to ".jpg".
            final int extensionAt = imgName.indexOf(".jpg");
            final String stem = extensionAt >= 0 ? imgName.substring(0,
                    extensionAt) : imgName;
            imgName = stem + ".jpg";

            map2.put("ImgH1", "h1/" + category + imagePath + "/" + imgName);
            map2.put("ImgH2", "h2/" + category + imagePath + "/" + imgName);
        } catch (final Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes one document into Elasticsearch using the settings read in
     * {@code setup} (cluster name, host, port, index name and type).
     *
     * <p>A new transport client is opened for the call and is now closed in a
     * {@code finally} block, so it is released even when the port cannot be
     * parsed or indexing fails.
     *
     * @param mapobject the document source
     * @param key the document id, generally the unique URL
     */
    public static void insertToElastic(final Map<String, Object> mapobject,
            final String key) {

        // "cluster.name" is taken from CLUSTER, i.e. the conf key "cluster".
        final Settings settings = ImmutableSettings.settingsBuilder()
                .put("cluster.name", CLUSTER).build();
        final TransportClient client = new TransportClient(settings);
        try {
            client.addTransportAddress(new InetSocketTransportAddress(
                    SEARCH_HOST, Integer.parseInt(SEARCH_PORT)));
            client.prepareIndex(SEARCH_INDEX_NAME, SEARCHtYPE, key)
                    .setSource(mapobject).execute().actionGet();
        } finally {
            client.close();
        }
    }

    /**
     * Completes a category of the form {@code <prefix>/<state>/} with a city.
     *
     * <p>Only tags ending in "/" are touched. When the tag has exactly two
     * segments, the second is looked up as a state in the state map; the
     * first city whose value occurs in the description has its key appended
     * to the tag. If no city matches - including when the state is unknown,
     * the state map is unavailable, or the tag does not have two segments -
     * the tag is set to "national". Previously an unknown state caused a
     * NullPointerException.
     *
     * @param mapobject the document fields; reads "tags" and "description",
     *            may rewrite "tags"
     */
    private static void assignCity(final Map<String, Object> mapobject) {
        final Object rawTags = mapobject.get("tags");
        if (rawTags == null) {
            return;
        }
        final String category = rawTags.toString();
        if (!category.endsWith("/")) {
            return;
        }

        final String[] catArr = category.split("/");
        if (catArr.length == 2 && STATE_MAP != null) {
            final Map<String, String> cities = STATE_MAP.get(catArr[1]);
            if (cities != null) {
                final Object rawDescription = mapobject.get("description");
                final String description = rawDescription == null ? ""
                        : rawDescription.toString();
                for (final Entry<String, String> e : cities.entrySet()) {
                    if (description.contains(e.getValue())) {
                        mapobject.put("tags", category + e.getKey());
                        return;
                    }
                }
            }
        }
        mapobject.put("tags", "national");
    }
}

/**
 * Writes a single cell to an HBase table.
 *
 * <p>Failures are printed and otherwise ignored. If the table cannot be
 * opened the method now returns immediately instead of continuing with a
 * null table, and the table is closed even when the write fails.
 *
 * @param tableName name of the table to write to
 * @param rowKey row key of the cell
 * @param family column family of the cell
 * @param qualifier column qualifier of the cell
 * @param value value to store
 * @param conf HBase configuration used to open the table
 */
public static void updateIntoHbase(final String tableName,
        final String rowKey, final String family, final String qualifier,
        final String value, final Configuration conf) {
    HTable table;
    try {
        table = new HTable(conf, tableName);
    } catch (final IOException e) {
        e.printStackTrace();
        return;
    }
    try {
        final Put put = new Put(Bytes.toBytes(rowKey));
        put.add(Bytes.toBytes(family), Bytes.toBytes(qualifier),
                Bytes.toBytes(value));
        table.put(put);
    } catch (final IOException e) {
        e.printStackTrace();
    } finally {
        try {
            table.close();
        } catch (final IOException e) {
            e.printStackTrace();
        }
    }
}

/**
 * Loads a two-level lookup table from a tab-separated file.
 *
 * <p>Each line with exactly three tab-separated fields contributes one
 * entry: the first field is the outer key, the second the inner key and the
 * third the value (which may itself contain tabs). Other lines are skipped.
 * Read errors are printed and whatever was loaded so far is returned.
 *
 * @param fileName path of the file to read
 * @return the loaded map; empty when the file is missing or unreadable
 */
private static Map<String, Map<String, String>> returnMap(
        final String fileName) {
    final Map<String, Map<String, String>> result = new HashMap<String, Map<String, String>>();
    BufferedReader reader = null;
    try {
        // NOTE(review): FileReader decodes with the platform default
        // charset - confirm this matches the file's encoding.
        reader = new BufferedReader(new FileReader(fileName));
        for (String line = reader.readLine(); line != null; line = reader
                .readLine()) {
            final String[] fields = line.split("\t", 3);
            if (fields.length != 3) {
                continue;
            }
            Map<String, String> inner = result.get(fields[0]);
            if (inner == null) {
                inner = new HashMap<String, String>();
                result.put(fields[0], inner);
            }
            inner.put(fields[1], fields[2]);
        }
    } catch (final FileNotFoundException e) {
        e.printStackTrace();
    } catch (final IOException e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}


/**
 * Command-line entry point: runs this tool through {@code ToolRunner} with a
 * fresh configuration and exits the JVM with the tool's return code.
 *
 * @param args command-line arguments passed on to the tool
 * @throws Exception if the tool run fails
 */
public static void main(final String[] args) throws Exception {
    final int exitCode = ToolRunner.run(new Configuration(),
            new HbaseToElastic(), args);
    System.exit(exitCode);
}
}