Redshift COPY creates different compression encodings from ANALYZE

Date: 2017-07-14 01:38:16

Tags: sql amazon-s3 amazon-redshift sqlbulkcopy data-compression

I have noticed that AWS Redshift recommends column compression encodings that are different from the ones it creates automatically when loading data (via COPY) into an empty table.

For example, I have created a table and loaded data from S3 as follows:

CREATE TABLE Client (Id varchar(511) , ClientId integer , CreatedOn timestamp, 
UpdatedOn timestamp , DeletedOn timestamp , LockVersion integer , RegionId 
varchar(511) , OfficeId varchar(511) , CountryId varchar(511) , 
FirstContactDate timestamp , DidExistPre boolean , IsActive boolean , 
StatusReason integer , CreatedById varchar(511) , IsLocked boolean , 
LockType integer , KeyWorker varchar(511) , InactiveDate timestamp , 
Current_Flag varchar(511) );

copy Client from 's3://<bucket-name>/<folder>/Client.csv' 
credentials 'aws_access_key_id=<access key>; aws_secret_access_key=<secret>' 
csv fillrecord truncatecolumns ignoreheader 1 
timeformat as 'YYYY-MM-DDTHH:MI:SS' gzip acceptinvchars 
compupdate on region 'ap-southeast-2';

Table client created. Execution time: 0.3s

Warnings: Load into table 'client' completed, 24284 record(s) loaded successfully. Load into table 'client' completed, 6 record(s) were loaded with replacements made for ACCEPTINVCHARS. Check 'stl_replacements' system table for details.

0 rows affected. COPY executed successfully.

Execution time: 3.39s
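Worth noting: the compupdate on option in the COPY above is what asks COPY to run its own compression analysis on the empty table and apply the encodings it picks. A variant that instead keeps whatever encodings the CREATE TABLE declared simply switches that option off (a sketch; only the compression option differs, and the placeholders are the question's own):

copy Client from 's3://<bucket-name>/<folder>/Client.csv' 
credentials 'aws_access_key_id=<access key>; aws_secret_access_key=<secret>' 
csv fillrecord truncatecolumns ignoreheader 1 
timeformat as 'YYYY-MM-DDTHH:MI:SS' gzip acceptinvchars 
compupdate off region 'ap-southeast-2';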

Once the load has completed, I can look at the column compression encodings that COPY applied:

select "column", type, encoding, distkey, sortkey, "notnull" 
from pg_table_def where tablename = 'client';

Which gives (column, type, encoding, distkey, sortkey, notnull):


╔══════════════════╦═════════════════════════════╦═══════╦═══════╦═══╦═══════╗
║ id               ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ clientid         ║ integer                     ║ delta ║ false ║ 0 ║ false ║
║ createdon        ║ timestamp without time zone ║ lzo   ║ false ║ 0 ║ false ║
║ updatedon        ║ timestamp without time zone ║ lzo   ║ false ║ 0 ║ false ║
║ deletedon        ║ timestamp without time zone ║ none  ║ false ║ 0 ║ false ║
║ lockversion      ║ integer                     ║ delta ║ false ║ 0 ║ false ║
║ regionid         ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ officeid         ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ countryid        ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ firstcontactdate ║ timestamp without time zone ║ lzo   ║ false ║ 0 ║ false ║
║ didexistprecirts ║ boolean                     ║ none  ║ false ║ 0 ║ false ║
║ isactive         ║ boolean                     ║ none  ║ false ║ 0 ║ false ║
║ statusreason     ║ integer                     ║ none  ║ false ║ 0 ║ false ║
║ createdbyid      ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ islocked         ║ boolean                     ║ none  ║ false ║ 0 ║ false ║
║ locktype         ║ integer                     ║ lzo   ║ false ║ 0 ║ false ║
║ keyworker        ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
║ inactivedate     ║ timestamp without time zone ║ lzo   ║ false ║ 0 ║ false ║
║ current_flag     ║ character varying(511)      ║ lzo   ║ false ║ 0 ║ false ║
╚══════════════════╩═════════════════════════════╩═══════╩═══════╩═══╩═══════╝
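(A side note on pg_table_def: it only lists tables whose schema is on the current search_path, so if the query above returns nothing for a table you know exists, add its schema first; 'myschema' below is a placeholder:)

set search_path to '$user', public, myschema;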

I can then run:

analyze compression client;

i.e. quite different results: ANALYZE COMPRESSION recommends ZSTD for most of these columns, where COPY chose lzo, delta, or raw.

I'm keen to know why this might be. I understand that my ~24K records fall short of the 100K that AWS specifies as the sample size required for a meaningful compression analysis, but it still seems strange that COPY and ANALYZE give different results for the same 24K-row table.
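For reference, the sample size ANALYZE COMPRESSION uses can be set explicitly with the COMPROWS option (a sketch; COMPROWS is an upper bound on the rows sampled per slice, so it cannot conjure up rows a 24K-row table does not have):

analyze compression client comprows 1000000;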

1 Answer:

Answer 0 (score: 2)

COPY does not currently recommend ZSTD encoding, which is why the suggested compression settings differ.

If you want to maximize compression (use the least space) on a permanent table, setting ZSTD across the board will get you close to optimal compression; see the sketch below.
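As an illustration, here is the question's own DDL with ZSTD declared on every column (a sketch; AWS documents ZSTD as working with all Redshift data types, and with encodings declared, a COPY run without compupdate on leaves them in place):

-- Same columns as the original CREATE TABLE, with ZSTD declared per column.
CREATE TABLE Client (
    Id               varchar(511) encode zstd,
    ClientId         integer      encode zstd,
    CreatedOn        timestamp    encode zstd,
    UpdatedOn        timestamp    encode zstd,
    DeletedOn        timestamp    encode zstd,
    LockVersion      integer      encode zstd,
    RegionId         varchar(511) encode zstd,
    OfficeId         varchar(511) encode zstd,
    CountryId        varchar(511) encode zstd,
    FirstContactDate timestamp    encode zstd,
    DidExistPre      boolean      encode zstd,
    StatusReason     integer      encode zstd,
    IsActive         boolean      encode zstd,
    CreatedById      varchar(511) encode zstd,
    IsLocked         boolean      encode zstd,
    LockType         integer      encode zstd,
    KeyWorker        varchar(511) encode zstd,
    InactiveDate     timestamp    encode zstd,
    Current_Flag     varchar(511) encode zstd
);

For a table that is already loaded, the usual route is a deep copy: create a new table with the desired encodings, INSERT INTO ... SELECT the rows across, then rename.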

RAW (shown as "none" in pg_table_def) is returned for some columns because in those cases there is no advantage to applying compression: the table occupies the same number of blocks with and without it. If you know the table is going to grow, it can be worth applying compression to those columns as well; the check sketched below shows how to verify this.
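The "same number of blocks" claim can be checked directly: Redshift stores each column in 1 MB blocks, and the block counts are visible in the stv_blocklist and stv_tbl_perm system tables. A sketch for the client table (the join below is the usual way to tie blocks to a table name):

-- 1 MB blocks used per column of 'client'; one output row per column number.
select p.name, b.col, count(*) as mb_blocks
from stv_blocklist b
join stv_tbl_perm p on b.tbl = p.id and b.slice = p.slice
where p.name = 'client'
group by p.name, b.col
order by b.col;

If a column shows the same block count before and after a deep copy into a compressed encoding, compression buys nothing at the current data volume.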