% Journal article (Historical Life Course Studies, volume 12, 2022) describing the
% Occode machine-learning pipeline for transcribing handwritten occupation codes.
% NOTE(review): the citation key below contains a non-ASCII character (the o-slash
% from the second author's surname). It is kept unchanged so existing citations
% keep resolving; classic 8-bit BibTeX toolchains may not accept it -- confirm
% the target toolchain (Biber handles UTF-8 keys and names).
@article{Pedersen_Holsbø_Andersen_Shvetsov_Ravn_Sommerseth_Bongo_2022,
  author   = {Pedersen, Bjørn-Richard and Holsbø, Einar and Andersen, Trygve and Shvetsov, Nikita and Ravn, Johan and Sommerseth, Hilde Leikny and Bongo, Lars Ailo},
  title    = {Lessons Learned Developing and Using a Machine Learning Model to Automatically Transcribe 2.3 Million Handwritten Occupation Codes},
  journal  = {Historical Life Course Studies},
  volume   = {12},
  pages    = {1--17},
  year     = {2022},
  month    = jan,
  doi      = {10.51964/hlcs11331},
  url      = {https://hlcs.nl/article/view/11331},
  abstract = {Machine learning approaches achieve high accuracy for text recognition and are therefore increasingly used for the transcription of handwritten historical sources. However, using machine learning in production requires a streamlined end-to-end pipeline that scales to the dataset size and a model that achieves high accuracy with few manual transcriptions. The correctness of the model results must also be verified. This paper describes our lessons learned developing, tuning and using the Occode end-to-end machine learning pipeline for transcribing 2.3 million handwritten occupation codes from the Norwegian 1950 population census. We achieve an accuracy of 97\% for the automatically transcribed codes, and we send 3\% of the codes for manual verification. We verify that the occupation code distribution found in our results matches the distribution found in our training data, which should be representative for the census as a whole. We believe our approach and lessons learned may be useful for other transcription projects that plan to use machine learning in production. The source code is available at https://github.com/uit-hdl/rhd-codes.},
}