PHP前端开发

csv2mysql

百变鹏仔 3小时前 #Python
文章标签 csv2mysql
<span style="color: #0000ff">import</span><span style="color: #000000"> os</span><span style="color: #0000ff">import</span><span style="color: #000000"> re</span><span style="color: #0000ff">import</span><span style="color: #000000"> sys</span><span style="color: #0000ff">import</span><span style="color: #000000"> csv</span><span style="color: #0000ff">import</span><span style="color: #000000"> time</span><span style="color: #0000ff">import</span><span style="color: #000000"> argparse</span><span style="color: #0000ff">import</span><span style="color: #000000"> collections</span><span style="color: #0000ff">import</span><span style="color: #000000"> MySQLdb</span><span style="color: #0000ff">import</span><span style="color: #000000"> warnings </span><span style="color: #008000">#</span><span style="color: #008000"> suppress annoying mysql warnings</span>warnings.filterwarnings(action=<span style="color: #800000">'</span><span style="color: #800000">ignore</span><span style="color: #800000">'</span>, category=<span style="color: #000000">MySQLdb.Warning) </span><span style="color: #0000ff">def</span><span style="color: #000000"> get_type(s):    </span><span style="color: #800000">"""</span><span style="color: #800000">Find type for this string    </span><span style="color: #800000">"""</span>    <span style="color: #008000">#</span><span style="color: #008000"> try integer type</span>    <span style="color: #0000ff">try</span><span style="color: #000000">:        v </span>=<span style="color: #000000"> int(s)    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:        </span><span style="color: #0000ff">pass</span>    <span style="color: #0000ff">else</span><span style="color: #000000">:        </span><span style="color: #0000ff">if</span> abs(v) > 2147483647<span style="color: #000000">:            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span>        <span style="color: #0000ff">else</span><span style="color: #000000">:            </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">int</span><span style="color: #800000">'</span>    <span style="color: #008000">#</span><span style="color: #008000"> try float type</span>    <span style="color: #0000ff">try</span><span style="color: #000000">:        float(s)    </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:        </span><span style="color: #0000ff">pass</span>    <span style="color: #0000ff">else</span><span style="color: #000000">:        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">double</span><span style="color: #800000">'</span>    <span style="color: #008000">#</span><span style="color: #008000"> check for timestamp</span>    dt_formats =<span style="color: #000000"> (        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d %H:%M:%S.%f</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">datetime</span><span style="color: #800000">'</span><span style="color: #000000">),        (</span><span style="color: #800000">'</span><span style="color: #800000">%Y-%m-%d</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">date</span><span style="color: #800000">'</span><span style="color: #000000">),        (</span><span style="color: #800000">'</span><span style="color: #800000">%H:%M:%S</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">time</span><span style="color: #800000">'</span><span style="color: #000000">),    )    </span><span style="color: #0000ff">for</span> dt_format, dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> dt_formats:        </span><span style="color: #0000ff">try</span><span style="color: #000000">:            time.strptime(s, dt_format)        </span><span style="color: #0000ff">except</span><span style="color: #000000"> ValueError:            </span><span style="color: #0000ff">pass</span>        <span style="color: #0000ff">else</span><span style="color: #000000">:            </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type       </span><span style="color: #008000">#</span><span style="color: #008000"> doesn't match any other types so assume text</span>    <span style="color: #0000ff">if</span> len(s) > 255<span style="color: #000000">:        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>    <span style="color: #0000ff">else</span><span style="color: #000000">:        </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span><span style="color: #0000ff">def</span> most_common(l, default=<span style="color: #800000">'</span><span style="color: #800000">varchar(255)</span><span style="color: #800000">'</span><span style="color: #000000">):    </span><span style="color: #800000">"""</span><span style="color: #800000">Return most common value from list    </span><span style="color: #800000">"""</span>    <span style="color: #008000">#</span><span style="color: #008000"> some formats trump others</span>    <span style="color: #0000ff">if</span><span style="color: #000000"> l:        </span><span style="color: #0000ff">for</span> dt_type <span style="color: #0000ff">in</span> (<span style="color: #800000">'</span><span style="color: #800000">text</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">bigint</span><span style="color: #800000">'</span><span style="color: #000000">):            </span><span style="color: #0000ff">if</span> dt_type <span style="color: #0000ff">in</span><span style="color: #000000"> l:                </span><span style="color: #0000ff">return</span><span style="color: #000000"> dt_type        </span><span style="color: #0000ff">return</span> max(l, key=<span style="color: #000000">l.count)    </span><span style="color: #0000ff">return</span><span style="color: #000000"> default</span><span style="color: #0000ff">def</span> get_col_types(input_file, max_rows=1000<span style="color: #000000">):    </span><span style="color: #800000">"""</span><span style="color: #800000">Find the type for each CSV column    </span><span style="color: #800000">"""</span><span style="color: #000000">    csv_types </span>=<span style="color: #000000"> collections.defaultdict(list)    reader </span>=<span style="color: #000000"> csv.reader(open(input_file))    </span><span style="color: #008000">#</span><span style="color: #008000"> test the first few rows for their data types</span>    <span style="color: #0000ff">for</span> row_i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(reader):        </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> 0:            header </span>=<span style="color: #000000"> row        </span><span style="color: #0000ff">else</span><span style="color: #000000">:            </span><span style="color: #0000ff">for</span> col_i, s <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(row):                data_type </span>=<span style="color: #000000"> get_type(s)                csv_types[header[col_i]].append(data_type)         </span><span style="color: #0000ff">if</span> row_i ==<span style="color: #000000"> max_rows:            </span><span style="color: #0000ff">break</span>    <span style="color: #008000">#</span><span style="color: #008000"> take the most common data type for each row</span>    <span style="color: #0000ff">return</span> [most_common(csv_types[col]) <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header]</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_schema(table, header, col_types):    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the schema for this table from given types and columns    </span><span style="color: #800000">"""</span><span style="color: #000000">    schema_sql </span>= <span style="color: #800000">"""</span><span style="color: #800000">CREATE TABLE IF NOT EXISTS %s (         id int NOT NULL AUTO_INCREMENT,</span><span style="color: #800000">"""</span> %<span style="color: #000000"> table     </span><span style="color: #0000ff">for</span> col_name, col_type <span style="color: #0000ff">in</span><span style="color: #000000"> zip(header, col_types):        schema_sql </span>+= <span style="color: #800000">'</span><span style="color: #800000">%s %s,</span><span style="color: #800000">'</span> %<span style="color: #000000"> (col_name, col_type)    schema_sql </span>+= <span style="color: #800000">"""</span><span style="color: #800000">PRIMARY KEY (id)        ) DEFAULT CHARSET=utf8;</span><span style="color: #800000">"""</span>    <span style="color: #0000ff">return</span><span style="color: #000000"> schema_sql</span><span style="color: #0000ff">def</span><span style="color: #000000"> get_insert(table, header):    </span><span style="color: #800000">"""</span><span style="color: #800000">Generate the SQL for inserting rows    </span><span style="color: #800000">"""</span><span style="color: #000000">    field_names </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span><span style="color: #000000">.join(header)    field_markers </span>= <span style="color: #800000">'</span><span style="color: #800000">, </span><span style="color: #800000">'</span>.join(<span style="color: #800000">'</span><span style="color: #800000">%s</span><span style="color: #800000">'</span> <span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> header)    </span><span style="color: #0000ff">return</span> <span style="color: #800000">'</span><span style="color: #800000">INSERT INTO %s (%s) VALUES (%s);</span><span style="color: #800000">'</span> %<span style="color: #000000">         (table, field_names, field_markers)</span><span style="color: #0000ff">def</span><span style="color: #000000"> format_header(row):    </span><span style="color: #800000">"""</span><span style="color: #800000">Format column names to remove illegal characters and duplicates    </span><span style="color: #800000">"""</span><span style="color: #000000">    safe_col </span>= <span style="color: #0000ff">lambda</span> s: re.sub(<span style="color: #800000">'</span><span style="color: #800000">W+</span><span style="color: #800000">'</span>, <span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span>, s.lower()).strip(<span style="color: #800000">'</span><span style="color: #800000">_</span><span style="color: #800000">'</span><span style="color: #000000">)    header </span>=<span style="color: #000000"> []    counts </span>=<span style="color: #000000"> collections.defaultdict(int)    </span><span style="color: #0000ff">for</span> col <span style="color: #0000ff">in</span><span style="color: #000000"> row:        col </span>=<span style="color: #000000"> safe_col(col)        counts[col] </span>+= 1        <span style="color: #0000ff">if</span> counts[col] > 1<span style="color: #000000">:            col </span>= <span style="color: #800000">'</span><span style="color: #800000">{}{}</span><span style="color: #800000">'</span><span style="color: #000000">.format(col, counts[col])        header.append(col)    </span><span style="color: #0000ff">return</span><span style="color: #000000"> header</span><span style="color: #0000ff">def</span> main(input_file, user, password, host, table, database, max_inserts=10000<span style="color: #000000">):    </span><span style="color: #0000ff">print</span> <span style="color: #800000">"</span><span style="color: #800000">Importing `%s' into MySQL database `%s.%s'</span><span style="color: #800000">"</span> %<span style="color: #000000"> (input_file, database, table)    db </span>= MySQLdb.connect(host=host, user=user, passwd=password, charset=<span style="color: #800000">'</span><span style="color: #800000">utf8</span><span style="color: #800000">'</span><span style="color: #000000">)    cursor </span>=<span style="color: #000000"> db.cursor()    </span><span style="color: #008000">#</span><span style="color: #008000"> create database and if doesn't exist</span>    cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">CREATE DATABASE IF NOT EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> database)    db.select_db(database)    </span><span style="color: #008000">#</span><span style="color: #008000"> define table</span>    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Analyzing column types ...</span><span style="color: #800000">'</span><span style="color: #000000">    col_types </span>=<span style="color: #000000"> get_col_types(input_file)    </span><span style="color: #0000ff">print</span><span style="color: #000000"> col_types    header </span>=<span style="color: #000000"> None    </span><span style="color: #0000ff">for</span> i, row <span style="color: #0000ff">in</span><span style="color: #000000"> enumerate(csv.reader(open(input_file))):        </span><span style="color: #0000ff">if</span><span style="color: #000000"> header:            </span><span style="color: #0000ff">while</span> len(row) <<span style="color: #000000"> len(header):                row.append(</span><span style="color: #800000">''</span>) <span style="color: #008000">#</span><span style="color: #008000"> this row is missing columns so pad blank values</span><span style="color: #000000">            cursor.execute(insert_sql, row)            </span><span style="color: #0000ff">if</span> i % max_inserts ==<span style="color: #000000"> 0:                db.commit()                </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">commit</span><span style="color: #800000">'</span>        <span style="color: #0000ff">else</span><span style="color: #000000">:            header </span>=<span style="color: #000000"> format_header(row)            schema_sql </span>=<span style="color: #000000"> get_schema(table, header, col_types)            </span><span style="color: #0000ff">print</span><span style="color: #000000"> schema_sql            </span><span style="color: #008000">#</span><span style="color: #008000"> create table</span>            cursor.execute(<span style="color: #800000">'</span><span style="color: #800000">DROP TABLE IF EXISTS %s;</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)            cursor.execute(schema_sql)            </span><span style="color: #008000">#</span><span style="color: #008000"> create index for more efficient access</span>            <span style="color: #0000ff">try</span><span style="color: #000000">:                cursor.execute(</span><span style="color: #800000">'</span><span style="color: #800000">CREATE INDEX ids ON %s (id);</span><span style="color: #800000">'</span> %<span style="color: #000000"> table)            </span><span style="color: #0000ff">except</span><span style="color: #000000"> MySQLdb.OperationalError:                </span><span style="color: #0000ff">pass</span> <span style="color: #008000">#</span><span style="color: #008000"> index already exists</span>            <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Inserting rows ...</span><span style="color: #800000">'</span>            <span style="color: #008000">#</span><span style="color: #008000"> SQL string for inserting data</span>            insert_sql =<span style="color: #000000"> get_insert(table, header)    </span><span style="color: #008000">#</span><span style="color: #008000"> commit rows to database</span>    <span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Committing rows to database ...</span><span style="color: #800000">'</span><span style="color: #000000">    db.commit()    </span><span style="color: #0000ff">print</span> <span style="color: #800000">'</span><span style="color: #800000">Done!</span><span style="color: #800000">'</span><span style="color: #0000ff">if</span> <span style="color: #800080">__name__</span> == <span style="color: #800000">'</span><span style="color: #800000">__main__</span><span style="color: #800000">'</span><span style="color: #000000">:    parser </span>= argparse.ArgumentParser(description=<span style="color: #800000">'</span><span style="color: #800000">Automatically insert CSV contents into MySQL</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--table</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">table</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the table. If not set the CSV filename will be used</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--database</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">database</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">test</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">Set the name of the database. If not set the test database will be used</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--user</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">user</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">root</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login username</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--password</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">password</span><span style="color: #800000">'</span>, default=<span style="color: #800000">''</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL login password</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">--host</span><span style="color: #800000">'</span>, dest=<span style="color: #800000">'</span><span style="color: #800000">host</span><span style="color: #800000">'</span>, default=<span style="color: #800000">'</span><span style="color: #800000">localhost</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The MySQL host</span><span style="color: #800000">'</span><span style="color: #000000">)    parser.add_argument(</span><span style="color: #800000">'</span><span style="color: #800000">input_file</span><span style="color: #800000">'</span>, help=<span style="color: #800000">'</span><span style="color: #800000">The input CSV file</span><span style="color: #800000">'</span><span style="color: #000000">)    args </span>= parser.parse_args(sys.argv[1<span style="color: #000000">:])    </span><span style="color: #0000ff">if</span> <span style="color: #0000ff">not</span><span style="color: #000000"> args.table:        </span><span style="color: #008000">#</span><span style="color: #008000"> use input file name for table</span>        args.table =<span style="color: #000000"> os.path.splitext(os.path.basename(args.input_file))[0]        main(args.input_file, args.user, args.password, args.host, args.table, args.database)</span>

具体使用例子如下

[root@server1]# python csv2mysql.py --host=172.20.197.61 --user=PdYRxGWNpVRCQfHj --password=RX5a5YsViQcDdywr --database=cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a --table=performance_history_2 performance_history_2.csv
Importing `performance_history_2.csv' into MySQL database `cf_dc61100a_92a7_43ca_81dd_2a7e3fa0808a.performance_history_2'
Analyzing column types ...
['datetime', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)', 'varchar(255)']
CREATE TABLE IF NOT EXISTS performance_history_2 (
id int NOT NULL AUTO_INCREMENT,
date_time datetime,
write_bw_mb_s varchar(255),
read_bw_mb_s varchar(255),
write_iops varchar(255),
read_iops varchar(255),
write_latency_usec varchar(255),
read_latency_usec varchar(255),
avg_latency_usec varchar(255),
PRIMARY KEY (id)
) DEFAULT CHARSET=utf8;
Inserting rows ...
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
commit
Committing rows to database ...
Done!