User:Elukey/Analytics/Hadoop


Things to remember when creating a cluster from scratch

Hadoop

sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /user/oozie
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /user/hive
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/camus
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/refinery
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/camus/webrequest-00
sudo -u analytics kerberos-run-command analytics hdfs dfs -mkdir /user/analytics
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/wmf
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw/webrequests_data_loss
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/raw/webrequests_faulty_hosts
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -mkdir /wmf/data/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown yarn:hadoop /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics-privatedata-users /wmf/data/raw/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/event
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics-privatedata-users /wmf/data/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/eventlogging
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequests_faulty_hosts
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequests_data_loss
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chown analytics:analytics /wmf/data/raw/webrequest
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/history
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /tmp
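
After the bootstrap it is worth double checking the layout; a quick read-only sanity check (a sketch, adapt the paths if the layout above changes):

# List the tree under /wmf and the sticky-bit directories created above.
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -ls -R /wmf
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -ls -d /tmp /user/history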

Hive

sudo -u hive /usr/lib/hive/bin/schematool -dbType mysql -initSchema
sudo -u analytics kerberos-run-command analytics beeline
  create database wmf_raw;
  create database event;
  create database wmf;
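
The same databases can also be created non-interactively; a minimal sketch using beeline's -e option, assuming the site beeline wrapper supplies the connection URL as it does for the interactive session above:

sudo -u analytics kerberos-run-command analytics beeline \
  -e "create database if not exists wmf_raw" \
  -e "create database if not exists event" \
  -e "create database if not exists wmf"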

Then create the tables from the DDL scripts contained in refinery's hive directory:

REMEMBER: change the target HDFS URI from prod to test in every script before executing it.
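
For example, assuming the scripts reference the production nameservice analytics-hadoop (an assumption, check the actual URI first) and the test cluster is analytics-test-hadoop as in the Oozie commands below, a sketch of the swap:

# Rewrite the HDFS URI from prod to test in the DDL scripts; review the diff before running hive -f.
sed -i 's|hdfs://analytics-hadoop|hdfs://analytics-test-hadoop|g' create_webrequest_*.hql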

sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/hive/warehouse/wmf_raw.db
sudo -u hdfs kerberos-run-command hdfs hdfs dfs -chmod 1777 /user/hive/warehouse/wmf.db

sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_raw_table.hql --database wmf_raw
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_sequence_stats_hourly_table.hql --database wmf_raw
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_sequence_stats_table.hql --database wmf_raw
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_subset_table.hql --database wmf (?)
sudo -u analytics kerberos-run-command analytics hive -f create_webrequest_table.hql --database wmf
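
A quick read-only check that the tables ended up in the right databases (a sketch, same beeline wrapper assumption as above):

sudo -u analytics kerberos-run-command analytics beeline -e "show tables in wmf_raw" -e "show tables in wmf"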

Refinery

Deploy refinery to HDFS as usual, with no changes or cherry picks for that deployment. A separate local checkout, with the test-specific tweaks below, is only used to stage the oozie directory under the user's HDFS home:

# after git clone refinery on a host, like an-tool1006
cd refinery
git fat init
git fat pull
# cherry pick https://gerrit.wikimedia.org/r/#/c/analytics/refinery/+/491791/
cd oozie
# replace analytics-alerts@ with your email
for filename in $(grep -rl analytics-alerts .); do sed -i 's/analytics-alerts@/ltoscano@/g' "$filename"; done
cd ..
hdfs dfs -copyFromLocal oozie /user/elukey/oozie

# Then deploy refinery to HDFS following the regular team's procedures.

# Note:
# - check in bundle_test.properties that the referenced refinery-hive jar is on HDFS
sudo -u analytics kerberos-run-command analytics oozie job \
-Duser=analytics \
-Dstart_time=2020-04-01T11:00Z \
-Derror_incomplete_data_threshold=100 \
-Dwarning_incomplete_data_threshold=100 \
-Derror_data_loss_threshold=100 \
-Dwarning_data_loss_threshold=100 \
-Dqueue_name=production \
-Doozie_directory=hdfs://analytics-test-hadoop/user/elukey/oozie \
-Drefinery_directory=hdfs://analytics-test-hadoop$(sudo -u analytics kerberos-run-command analytics hdfs dfs -ls -d /wmf/refinery/2020* | tail -n 1 | awk '{print $NF}') \
-oozie $OOZIE_URL -run -config /home/elukey/refinery/oozie/webrequest/load/bundle_test.properties
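
Once submitted, the bundle can be monitored from the CLI as well; a minimal sketch (<bundle-id> is the id printed by the -run command above):

# Check the status of the submitted bundle and list the most recent bundles.
sudo -u analytics kerberos-run-command analytics oozie job -oozie $OOZIE_URL -info <bundle-id>
sudo -u analytics kerberos-run-command analytics oozie jobs -oozie $OOZIE_URL -jobtype bundle -len 10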

Hue

sudo -u hue /usr/lib/hue/build/env/bin/hue syncdb
sudo -u hue /usr/lib/hue/build/env/bin/hue migrate
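
Hue is a Django application and the hue binary above exposes the standard Django management commands, so an initial admin account can be created with createsuperuser (a sketch; depending on the configured auth backend, the first user logging into the UI may also become superuser):

sudo -u hue /usr/lib/hue/build/env/bin/hue createsuperuser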